diff --git a/packages/dashboard/CHANGELOG.md b/packages/dashboard/CHANGELOG.md index 9b244c58..b9e5c2bf 100644 --- a/packages/dashboard/CHANGELOG.md +++ b/packages/dashboard/CHANGELOG.md @@ -2,4 +2,15 @@ ## 0.1.0 +- Reworked the dashboard into a richer operations console with dedicated views + for tasks, jobs, workflows, workers, failures, audit, events, namespaces, and + search. +- Refactored UI rendering into modular page components and shared table/layout + primitives for better maintainability. +- Introduced a full Tailwind-based styling system and updated responsive layout + behavior for sidebar/header/content rendering. +- Improved navigation and Turbo frame behavior to reduce stale-content flashes + during page switches. +- Expanded dashboard state/service/server models and test coverage to support + the new views and metadata-rich rendering paths. - Initial release of the `stem_dashboard` package. diff --git a/packages/dashboard/README.md b/packages/dashboard/README.md index 79cea7c4..59ba1183 100644 --- a/packages/dashboard/README.md +++ b/packages/dashboard/README.md @@ -36,6 +36,7 @@ Environment variables mirror the Stem CLI: - `STEM_RESULT_BACKEND_URL` (defaults to the broker URL when omitted) - `STEM_NAMESPACE` / `STEM_DASHBOARD_NAMESPACE` (defaults to `stem`) - `STEM_TLS_*` for TLS-enabled Redis endpoints +- `DASHBOARD_BASE_PATH` (optional mount prefix such as `/dashboard`) Because the dashboard reuses `StemConfig`, any broker/result backend supported by Stem (`redis://`, `rediss://`, `postgres://`, `postgresql://`, `memory://`) @@ -45,6 +46,37 @@ The events page keeps a websocket open to `/dash/streams` so new queue/worker deltas appear instantly without refreshing. Tasks and workers pages use Turbo Frames for navigation and sorting. +## Library Embedding + +`stem_dashboard` can run standalone (via `runDashboardServer`) or be mounted +into an existing `routed` engine: + +```dart +import 'package:routed/routed.dart'; +import 'package:stem_dashboard/dashboard.dart'; + +Future main() async { + final service = await StemDashboardService.connect(); + final state = DashboardState(service: service); + await state.start(); + + final engine = Engine(); + mountDashboard( + engine: engine, + service: service, + state: state, + options: const DashboardMountOptions(basePath: '/dashboard'), + ); + + await engine.serve(host: '127.0.0.1', port: 8080); +} +``` + +For embedded usage, the host app owns lifecycle: + +- call `state.start()` before serving. +- call `state.dispose()` and `service.close()` on shutdown. + ### Local dependency overrides `pubspec.yaml` contains overrides pointing at the local Stem packages so the diff --git a/packages/dashboard/bin/dashboard.dart b/packages/dashboard/bin/dashboard.dart index 547cccff..b75ce116 100644 --- a/packages/dashboard/bin/dashboard.dart +++ b/packages/dashboard/bin/dashboard.dart @@ -6,6 +6,7 @@ Future main(List args) async { final host = Platform.environment['DASHBOARD_HOST']?.trim(); final portRaw = Platform.environment['DASHBOARD_PORT']?.trim(); final echoRaw = Platform.environment['DASHBOARD_ECHO_ROUTES']?.trim(); + final basePath = Platform.environment['DASHBOARD_BASE_PATH']?.trim(); final resolvedHost = host != null && host.isNotEmpty ? host : '127.0.0.1'; final resolvedPort = int.tryParse(portRaw ?? '') ?? 3080; @@ -17,6 +18,7 @@ Future main(List args) async { host: resolvedHost, port: resolvedPort, echoRoutes: echoRoutes, + basePath: basePath ?? '', ), ); } diff --git a/packages/dashboard/lib/dashboard.dart b/packages/dashboard/lib/dashboard.dart index c0a9329b..70ac12e2 100644 --- a/packages/dashboard/lib/dashboard.dart +++ b/packages/dashboard/lib/dashboard.dart @@ -1,3 +1,11 @@ -export 'src/server.dart' show DashboardServerOptions, runDashboardServer; +export 'src/server.dart' + show + DashboardMountOptions, + DashboardServerOptions, + buildDashboardEngine, + mountDashboard, + registerDashboardRoutes, + runDashboardServer; export 'src/services/stem_service.dart' show DashboardDataSource, StemDashboardService; +export 'src/state/dashboard_state.dart' show DashboardState; diff --git a/packages/dashboard/lib/src/config/config.dart b/packages/dashboard/lib/src/config/config.dart index d4cc7d5a..94968562 100644 --- a/packages/dashboard/lib/src/config/config.dart +++ b/packages/dashboard/lib/src/config/config.dart @@ -11,6 +11,11 @@ class DashboardConfig { required this.stem, required this.namespace, required this.routing, + required this.alertWebhookUrls, + required this.alertBacklogThreshold, + required this.alertFailedTaskThreshold, + required this.alertOfflineWorkerThreshold, + required this.alertCooldown, }); /// Loads a dashboard config from the provided environment map. @@ -29,12 +34,37 @@ class DashboardConfig { final routing = RoutingConfigLoader( StemRoutingContext.fromConfig(stemConfig), ).load(); + final webhookUrls = _parseCsv( + env['STEM_DASHBOARD_ALERT_WEBHOOK_URLS'] ?? + env['STEM_DASHBOARD_WEBHOOK_URLS'], + ); + final backlogThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_BACKLOG_THRESHOLD'], + fallback: 500, + ); + final failedThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_FAILED_TASK_THRESHOLD'], + fallback: 25, + ); + final offlineThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_OFFLINE_WORKER_THRESHOLD'], + fallback: 1, + ); + final cooldown = _parseDuration( + env['STEM_DASHBOARD_ALERT_COOLDOWN'], + fallback: const Duration(minutes: 5), + ); return DashboardConfig._( environment: Map.unmodifiable(env), stem: stemConfig, namespace: namespace, routing: routing, + alertWebhookUrls: webhookUrls, + alertBacklogThreshold: backlogThreshold, + alertFailedTaskThreshold: failedThreshold, + alertOfflineWorkerThreshold: offlineThreshold, + alertCooldown: cooldown, ); } @@ -54,6 +84,21 @@ class DashboardConfig { /// Routing registry resolved for this dashboard session. final RoutingRegistry routing; + /// Alert webhook URLs. + final List alertWebhookUrls; + + /// Backlog alert threshold. + final int alertBacklogThreshold; + + /// Failed task alert threshold. + final int alertFailedTaskThreshold; + + /// Offline worker alert threshold. + final int alertOfflineWorkerThreshold; + + /// Alert cooldown. + final Duration alertCooldown; + /// Broker URL resolved from the underlying Stem config. String get brokerUrl => stem.brokerUrl; @@ -63,3 +108,39 @@ class DashboardConfig { /// TLS configuration resolved from the underlying Stem config. TlsConfig get tls => stem.tls; } + +List _parseCsv(String? raw) { + if (raw == null || raw.trim().isEmpty) return const []; + return raw + .split(',') + .map((value) => value.trim()) + .where((value) => value.isNotEmpty) + .toList(growable: false); +} + +int _parsePositiveInt(String? raw, {required int fallback}) { + if (raw == null || raw.trim().isEmpty) return fallback; + final parsed = int.tryParse(raw.trim()); + if (parsed == null || parsed <= 0) return fallback; + return parsed; +} + +Duration _parseDuration(String? raw, {required Duration fallback}) { + if (raw == null || raw.trim().isEmpty) return fallback; + final value = raw.trim(); + final match = RegExp(r'^(\d+)(ms|s|m|h)$').firstMatch(value); + if (match == null) return fallback; + final amount = int.tryParse(match.group(1) ?? ''); + if (amount == null || amount <= 0) return fallback; + switch (match.group(2)) { + case 'ms': + return Duration(milliseconds: amount); + case 's': + return Duration(seconds: amount); + case 'm': + return Duration(minutes: amount); + case 'h': + return Duration(hours: amount); + } + return fallback; +} diff --git a/packages/dashboard/lib/src/server.dart b/packages/dashboard/lib/src/server.dart index d1767abe..6d66b1f6 100644 --- a/packages/dashboard/lib/src/server.dart +++ b/packages/dashboard/lib/src/server.dart @@ -1,9 +1,11 @@ +import 'dart:async'; import 'dart:convert'; import 'dart:io'; import 'package:routed/routed.dart'; import 'package:routed_hotwire/routed_hotwire.dart'; -import 'package:stem/stem.dart' show generateEnvelopeId; +import 'package:stem/stem.dart' + show TaskState, generateEnvelopeId, stemLogContext, stemLogger; import 'package:stem_dashboard/src/config/config.dart'; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; @@ -11,6 +13,19 @@ import 'package:stem_dashboard/src/state/dashboard_state.dart'; import 'package:stem_dashboard/src/stem/control_messages.dart'; import 'package:stem_dashboard/src/ui/content.dart'; import 'package:stem_dashboard/src/ui/layout.dart'; +import 'package:stem_dashboard/src/ui/overview.dart'; +import 'package:stem_dashboard/src/ui/paths.dart'; + +/// Mount options for embedding the dashboard in a host app. +class DashboardMountOptions { + /// Creates mount options. + const DashboardMountOptions({this.basePath = ''}); + + /// Prefix path used when mounting routes into a host app. + /// + /// Examples: `''` (root), `'/dashboard'`. + final String basePath; +} /// Options controlling how the dashboard server binds to the network. class DashboardServerOptions { @@ -19,6 +34,7 @@ class DashboardServerOptions { this.host = '127.0.0.1', this.port = 3080, this.echoRoutes = false, + this.basePath = '', }); /// Hostname or IP address for the HTTP server. @@ -30,12 +46,21 @@ class DashboardServerOptions { /// Whether to log each registered route on startup. final bool echoRoutes; + /// Prefix path used when serving the dashboard from a sub-route. + final String basePath; + /// Returns a copy with the provided fields replaced. - DashboardServerOptions copyWith({String? host, int? port, bool? echoRoutes}) { + DashboardServerOptions copyWith({ + String? host, + int? port, + bool? echoRoutes, + String? basePath, + }) { return DashboardServerOptions( host: host ?? this.host, port: port ?? this.port, echoRoutes: echoRoutes ?? this.echoRoutes, + basePath: basePath ?? this.basePath, ); } } @@ -53,7 +78,19 @@ Future runDashboardServer({ final dashboardService = service ?? await StemDashboardService.connect(resolvedConfig!); final stateOwner = state == null; - final dashboardState = state ?? DashboardState(service: dashboardService); + final dashboardState = + state ?? + DashboardState( + service: dashboardService, + alertWebhookUrls: resolvedConfig?.alertWebhookUrls ?? const [], + alertBacklogThreshold: resolvedConfig?.alertBacklogThreshold ?? 500, + alertFailedTaskThreshold: + resolvedConfig?.alertFailedTaskThreshold ?? 25, + alertOfflineWorkerThreshold: + resolvedConfig?.alertOfflineWorkerThreshold ?? 1, + alertCooldown: + resolvedConfig?.alertCooldown ?? const Duration(minutes: 5), + ); if (stateOwner) { await dashboardState.start(); @@ -61,10 +98,22 @@ Future runDashboardServer({ final engine = buildDashboardEngine( service: dashboardService, state: dashboardState, + basePath: options.basePath, ); + final resolvedBasePath = normalizeDashboardBasePath(options.basePath); + final dashboardUrlPath = dashboardRoute(resolvedBasePath, '/'); - stdout.writeln( - '[stem-dashboard] Starting on http://${options.host}:${options.port}', + stemLogger.info( + 'Starting dashboard server', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'host': options.host, + 'port': options.port, + 'basePath': dashboardUrlPath, + }, + ), ); try { @@ -73,7 +122,9 @@ Future runDashboardServer({ port: options.port, echo: options.echoRoutes, ); + await _waitForShutdownSignal(); } finally { + await engine.close(); if (stateOwner) { await dashboardState.dispose(); } @@ -83,15 +134,75 @@ Future runDashboardServer({ } } +Future _waitForShutdownSignal() async { + final completer = Completer(); + final subscriptions = >[]; + + void complete(ProcessSignal signal) { + stemLogger.info( + 'Shutdown signal received', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: {'signal': signal.toString()}, + ), + ); + if (!completer.isCompleted) { + completer.complete(); + } + } + + void watch(ProcessSignal signal) { + subscriptions.add(signal.watch().listen(complete)); + } + + watch(ProcessSignal.sigint); + if (!Platform.isWindows) { + watch(ProcessSignal.sigterm); + } + + try { + await completer.future; + } finally { + for (final subscription in subscriptions) { + await subscription.cancel(); + } + } +} + /// Constructs the dashboard engine with routes and Turbo streaming. Engine buildDashboardEngine({ required DashboardDataSource service, required DashboardState state, + String basePath = '', }) { final engine = Engine(); - _registerRoutes(engine, service, state); + mountDashboard( + engine: engine, + service: service, + state: state, + options: DashboardMountOptions(basePath: basePath), + ); + return engine; +} + +/// Mounts dashboard routes and websocket streams into an existing [engine]. +void mountDashboard({ + required Engine engine, + required DashboardDataSource service, + required DashboardState state, + DashboardMountOptions options = const DashboardMountOptions(), +}) { + final resolvedBasePath = normalizeDashboardBasePath(options.basePath); + registerDashboardRoutes( + engine, + service, + state, + basePath: resolvedBasePath, + ); + final streamPath = dashboardRoute(resolvedBasePath, '/dash/streams'); engine.ws( - '/dash/streams', + streamPath, TurboStreamSocketHandler( hub: state.hub, topicResolver: (context) => @@ -99,84 +210,414 @@ Engine buildDashboardEngine({ const ['stem-dashboard:events'], ), ); - return engine; } -void _registerRoutes( +/// Registers the dashboard HTTP routes on [engine]. +void registerDashboardRoutes( Engine engine, DashboardDataSource service, - DashboardState state, -) { + DashboardState state, { + String basePath = '', +}) { engine ..get( - '/', - (ctx) => _renderPage(ctx, DashboardPage.overview, service, state), + dashboardRoute(basePath, '/'), + (ctx) => _renderPage( + ctx, + DashboardPage.overview, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks'), + (ctx) => _renderPage( + ctx, + DashboardPage.tasks, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks/detail'), + (ctx) => _renderPage( + ctx, + DashboardPage.taskDetail, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks/inline'), + (ctx) => _renderTaskInline(ctx, service, basePath: basePath), + ) + ..get( + dashboardRoute(basePath, '/failures'), + (ctx) => _renderPage( + ctx, + DashboardPage.failures, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/search'), + (ctx) => _renderPage( + ctx, + DashboardPage.search, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/audit'), + (ctx) => _renderPage( + ctx, + DashboardPage.audit, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/events'), + (ctx) => _renderPage( + ctx, + DashboardPage.events, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/namespaces'), + (ctx) => _renderPage( + ctx, + DashboardPage.namespaces, + service, + state, + basePath: basePath, + ), ) ..get( - '/tasks', - (ctx) => _renderPage(ctx, DashboardPage.tasks, service, state), + dashboardRoute(basePath, '/workflows'), + (ctx) => _renderPage( + ctx, + DashboardPage.workflows, + service, + state, + basePath: basePath, + ), ) ..get( - '/events', - (ctx) => _renderPage(ctx, DashboardPage.events, service, state), + dashboardRoute(basePath, '/jobs'), + (ctx) => _renderPage( + ctx, + DashboardPage.jobs, + service, + state, + basePath: basePath, + ), ) ..get( - '/workers', - (ctx) => _renderPage(ctx, DashboardPage.workers, service, state), + dashboardRoute(basePath, '/workers'), + (ctx) => _renderPage( + ctx, + DashboardPage.workers, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/partials/overview'), + (ctx) => _renderOverviewPartials(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/tasks/enqueue'), + (ctx) => _enqueueTask(ctx, service, state, basePath: basePath), ) - ..post('/tasks/enqueue', (ctx) => _enqueueTask(ctx, service)) - ..post('/workers/control', (ctx) => _controlWorkers(ctx, service)) - ..post('/queues/replay', (ctx) => _replayDeadLetters(ctx, service)); + ..post( + dashboardRoute(basePath, '/tasks/action'), + (ctx) => _taskAction(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/workers/control'), + (ctx) => _controlWorkers(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/queues/replay'), + (ctx) => _replayDeadLetters(ctx, service, state, basePath: basePath), + ); +} + +Future _renderOverviewPartials( + EngineContext ctx, + DashboardDataSource service, + DashboardState state, { + required String basePath, +}) async { + try { + final queues = await service.fetchQueueSummaries(); + final workers = await service.fetchWorkerStatuses(); + final taskStatuses = await service.fetchTaskStatuses(limit: 300); + final sections = buildOverviewSections( + queues, + workers, + state.throughput, + taskStatuses, + defaultNamespace: _resolveDefaultNamespace(workers, taskStatuses), + ); + + final updates = [ + turboStreamReplace( + target: 'overview-metrics', + html: prefixDashboardUrlAttributes(sections.metrics, basePath), + ), + turboStreamReplace( + target: 'overview-namespaces', + html: prefixDashboardUrlAttributes(sections.namespaces, basePath), + ), + turboStreamReplace( + target: 'overview-queue-table', + html: prefixDashboardUrlAttributes(sections.topQueues, basePath), + ), + turboStreamReplace( + target: 'overview-workflows', + html: prefixDashboardUrlAttributes(sections.workflows, basePath), + ), + turboStreamReplace( + target: 'overview-jobs', + html: prefixDashboardUrlAttributes(sections.jobs, basePath), + ), + turboStreamReplace( + target: 'overview-latency-table', + html: prefixDashboardUrlAttributes(sections.latency, basePath), + ), + turboStreamReplace( + target: 'overview-recent-tasks', + html: prefixDashboardUrlAttributes(sections.recentTasks, basePath), + ), + ].join('\n'); + + return ctx.turboStream(updates); + } on Object catch (error, stack) { + stemLogger.error( + 'Failed to render overview partials', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + return ctx.turboHtml( + '
Failed to refresh overview metrics.
', + statusCode: HttpStatus.internalServerError, + ); + } } Future _renderPage( EngineContext ctx, DashboardPage page, DashboardDataSource service, - DashboardState state, -) async { + DashboardState state, { + required String basePath, +}) async { final turbo = ctx.turbo; try { - final queues = page == DashboardPage.events - ? const [] - : await service.fetchQueueSummaries(); + final needsQueues = + page == DashboardPage.overview || + page == DashboardPage.tasks || + page == DashboardPage.workers || + page == DashboardPage.search || + page == DashboardPage.namespaces; + final queues = needsQueues + ? await service.fetchQueueSummaries() + : const []; final workers = - page == DashboardPage.overview || page == DashboardPage.workers + page == DashboardPage.overview || + page == DashboardPage.workers || + page == DashboardPage.search || + page == DashboardPage.namespaces ? await service.fetchWorkerStatuses() : const []; - final tasksOptions = page == DashboardPage.tasks + var tasksOptions = page == DashboardPage.tasks ? _parseTasksOptions(ctx.uri.queryParameters) : const TasksPageOptions(); + final failuresOptions = page == DashboardPage.failures + ? _parseFailuresOptions(ctx.uri.queryParameters) + : const FailuresPageOptions(); + + final searchOptions = page == DashboardPage.search + ? _parseSearchOptions(ctx.uri.queryParameters) + : const SearchPageOptions(); + final namespacesOptions = page == DashboardPage.namespaces + ? _parseNamespacesOptions(ctx.uri.queryParameters) + : const NamespacesPageOptions(); + final workflowsOptions = page == DashboardPage.workflows + ? _parseWorkflowsOptions(ctx.uri.queryParameters) + : const WorkflowsPageOptions(); + final jobsOptions = page == DashboardPage.jobs + ? _parseJobsOptions(ctx.uri.queryParameters) + : const JobsPageOptions(); final workersOptions = page == DashboardPage.workers ? _parseWorkersOptions(ctx.uri.queryParameters) : const WorkersPageOptions(); + List taskStatuses; + if (page == DashboardPage.tasks) { + final localFilteringNeeded = + tasksOptions.hasNamespaceFilter || + tasksOptions.hasTaskFilter || + tasksOptions.hasRunIdFilter; + if (!localFilteringNeeded) { + final pageRequest = await service.fetchTaskStatuses( + state: tasksOptions.stateFilter, + queue: tasksOptions.filter, + limit: tasksOptions.pageSize + 1, + offset: tasksOptions.offset, + ); + final hasNextPage = pageRequest.length > tasksOptions.pageSize; + taskStatuses = hasNextPage + ? pageRequest.take(tasksOptions.pageSize).toList(growable: false) + : pageRequest; + tasksOptions = tasksOptions.copyWith( + hasNextPage: hasNextPage, + hasPreviousPage: tasksOptions.page > 1, + ); + } else { + final source = tasksOptions.hasRunIdFilter + ? await service.fetchTaskStatusesForRun( + tasksOptions.runId!, + limit: 1000, + ) + : await service.fetchTaskStatuses( + state: tasksOptions.stateFilter, + queue: tasksOptions.filter, + limit: 1000, + ); + final filtered = _applyTaskViewFilters(source, tasksOptions); + final pageItems = filtered + .skip(tasksOptions.offset) + .take(tasksOptions.pageSize) + .toList(growable: false); + final hasNextPage = + filtered.length > tasksOptions.offset + pageItems.length; + taskStatuses = pageItems; + tasksOptions = tasksOptions.copyWith( + hasNextPage: hasNextPage, + hasPreviousPage: tasksOptions.page > 1, + ); + } + } else if (page == DashboardPage.failures) { + taskStatuses = await service.fetchTaskStatuses( + state: TaskState.failed, + queue: failuresOptions.queue, + limit: 300, + ); + } else if (page == DashboardPage.overview) { + taskStatuses = await service.fetchTaskStatuses(limit: 300); + } else if (page == DashboardPage.search) { + taskStatuses = await service.fetchTaskStatuses(limit: 500); + } else if (page == DashboardPage.namespaces) { + taskStatuses = await service.fetchTaskStatuses(limit: 600); + } else if (page == DashboardPage.workflows) { + taskStatuses = await service.fetchTaskStatuses(limit: 700); + } else if (page == DashboardPage.jobs) { + taskStatuses = await service.fetchTaskStatuses(limit: 700); + } else { + taskStatuses = const []; + } + + final taskDetail = page == DashboardPage.taskDetail + ? await service.fetchTaskStatus(ctx.uri.queryParameters['id'] ?? '') + : null; + final runId = ctx.uri.queryParameters['runId']?.trim().isNotEmpty ?? false + ? ctx.uri.queryParameters['runId']!.trim() + : taskDetail?.runId; + final runTimeline = page == DashboardPage.taskDetail && runId != null + ? await service.fetchTaskStatusesForRun(runId, limit: 250) + : const []; + final workflowRun = page == DashboardPage.taskDetail && runId != null + ? await service.fetchWorkflowRun(runId) + : null; + final workflowSteps = page == DashboardPage.taskDetail && runId != null + ? await service.fetchWorkflowSteps(runId) + : const []; + final content = buildPageContent( page: page, queues: queues, workers: workers, + taskStatuses: taskStatuses, + taskDetail: taskDetail, + runTimeline: runTimeline, + workflowRun: workflowRun, + workflowSteps: workflowSteps, + auditEntries: page == DashboardPage.search || page == DashboardPage.audit + ? state.auditEntries + : const [], throughput: page == DashboardPage.overview ? state.throughput : null, events: page == DashboardPage.events ? state.events : const [], + defaultNamespace: _resolveDefaultNamespace(workers, taskStatuses), tasksOptions: tasksOptions, workersOptions: workersOptions, + failuresOptions: failuresOptions, + searchOptions: searchOptions, + namespacesOptions: namespacesOptions, + workflowsOptions: workflowsOptions, + jobsOptions: jobsOptions, ); + final contentWithBasePath = prefixDashboardUrlAttributes(content, basePath); + final streamPath = dashboardRoute(basePath, '/dash/streams'); if (turbo.isFrameRequest) { - return ctx.turboFrame(renderFrame(page, content)); + return ctx.turboFrame(renderFrame(page, contentWithBasePath)); } - return ctx.turboHtml(renderLayout(page, content)); + return ctx.turboHtml( + renderLayout( + page, + contentWithBasePath, + basePath: basePath, + streamPath: streamPath, + ), + ); } on Object catch (error, stack) { - stderr - ..writeln( - '[stem-dashboard] Failed to render ${page.name} page: $error', - ) - ..writeln(stack); + stemLogger.error( + 'Failed to render dashboard page', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'page': page.name, + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); final errorContent = _renderErrorPanel(error); if (turbo.isFrameRequest) { - return ctx.turboFrame(renderFrame(page, errorContent)); + return ctx.turboFrame( + renderFrame(page, prefixDashboardUrlAttributes(errorContent, basePath)), + ); } - return ctx.turboHtml(renderLayout(page, errorContent)); + return ctx.turboHtml( + renderLayout( + page, + prefixDashboardUrlAttributes(errorContent, basePath), + basePath: basePath, + ), + ); } } @@ -193,15 +634,60 @@ String _renderErrorPanel(Object error) { '''; } +Future _renderTaskInline( + EngineContext ctx, + DashboardDataSource service, { + required String basePath, +}) async { + final taskId = (ctx.uri.queryParameters['id'] ?? '').trim(); + final target = _sanitizeDomTarget(ctx.uri.queryParameters['target'] ?? ''); + if (target.isEmpty) { + return ctx.turboHtml( + '
Missing inline target.
', + statusCode: HttpStatus.badRequest, + ); + } + + DashboardTaskStatusEntry? task; + if (taskId.isNotEmpty) { + task = await service.fetchTaskStatus(taskId); + } + + final content = prefixDashboardUrlAttributes( + buildTaskInlineContent(task), + basePath, + ); + final payload = + '
$content
'; + return ctx.turboStream(turboStreamReplace(target: target, html: payload)); +} + +String _sanitizeDomTarget(String raw) { + final trimmed = raw.trim(); + if (trimmed.isEmpty) return ''; + final validPattern = RegExp(r'^[A-Za-z][A-Za-z0-9:_-]*$'); + return validPattern.hasMatch(trimmed) ? trimmed : ''; +} + Future _enqueueTask( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final tasksPath = dashboardRoute(basePath, '/tasks'); try { final queue = (await ctx.postForm('queue')).trim(); final task = (await ctx.postForm('task')).trim(); if (queue.isEmpty || task.isEmpty) { - return ctx.turboSeeOther('/tasks?error=missing-fields'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: queue/task missing.', + ); + return ctx.turboSeeOther('$tasksPath?error=missing-fields'); } final payloadText = (await ctx.postForm('payload')).trim(); @@ -212,10 +698,24 @@ Future _enqueueTask( if (decoded is Map) { args = decoded; } else { - return ctx.turboSeeOther('/tasks?error=invalid-payload'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: payload not a JSON object.', + ); + return ctx.turboSeeOther('$tasksPath?error=invalid-payload'); } } on Object { - return ctx.turboSeeOther('/tasks?error=invalid-payload'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: invalid JSON payload.', + ); + return ctx.turboSeeOther('$tasksPath?error=invalid-payload'); } } @@ -235,12 +735,176 @@ Future _enqueueTask( maxRetries: maxRetries, ), ); - return ctx.turboSeeOther('/tasks?flash=queued'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'ok', + actor: 'dashboard', + summary: 'Queued task "$task" on "$queue".', + metadata: {'queue': queue, 'task': task}, + ); + return ctx.turboSeeOther('$tasksPath?flash=queued'); + } on Object catch (error, stack) { + stemLogger.error( + 'Dashboard enqueue failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue failed: $error', + ); + return ctx.turboSeeOther('$tasksPath?error=enqueue-failed'); + } +} + +Future _taskAction( + EngineContext ctx, + DashboardDataSource service, + DashboardState state, { + required String basePath, +}) async { + final redirect = _resolveRedirectPath( + await ctx.defaultPostForm('redirect', dashboardRoute(basePath, '/tasks')), + fallbackPath: dashboardRoute(basePath, '/tasks'), + ); + try { + final action = (await ctx.postForm('action')).trim().toLowerCase(); + final taskId = (await ctx.postForm('taskId')).trim(); + final queueRaw = (await ctx.defaultPostForm('queue', '')).trim(); + final queue = queueRaw.isEmpty ? null : queueRaw; + + if (taskId.isEmpty) { + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Task action rejected: missing task id.', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, {'error': 'Task ID is required.'}), + ); + } + + switch (action) { + case 'cancel': + final reasonRaw = (await ctx.defaultPostForm( + 'reason', + 'Cancelled from dashboard.', + )).trim(); + final terminate = _isTruthy( + (await ctx.defaultPostForm('terminate', 'false')).trim(), + ); + final revoked = await service.revokeTask( + taskId, + terminate: terminate, + reason: reasonRaw.isEmpty ? null : reasonRaw, + ); + if (!revoked) { + state.recordAudit( + kind: 'action', + action: 'task.cancel', + status: 'error', + actor: 'dashboard', + summary: 'Failed to revoke task $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Unable to revoke task $taskId.', + }), + ); + } + state.recordAudit( + kind: 'action', + action: 'task.cancel', + status: 'ok', + actor: 'dashboard', + summary: 'Revocation requested for $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'flash': 'Revocation requested for task $taskId.', + }), + ); + case 'replay': + final replayed = await service.replayTaskById(taskId, queue: queue); + if (!replayed) { + state.recordAudit( + kind: 'action', + action: 'task.replay', + status: 'error', + actor: 'dashboard', + summary: 'Task $taskId was not found in dead letters.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Task $taskId was not found in dead letters.', + }), + ); + } + state.recordAudit( + kind: 'action', + action: 'task.replay', + status: 'ok', + actor: 'dashboard', + summary: 'Replayed dead-letter task $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'flash': 'Replayed dead-letter task $taskId as a new envelope.', + }), + ); + default: + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Unsupported task action "$action".', + metadata: {'taskId': taskId}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Unsupported task action "$action".', + }), + ); + } } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] enqueue failed: $error') - ..writeln(stack); - return ctx.turboSeeOther('/tasks?error=enqueue-failed'); + stemLogger.error( + 'Dashboard task action failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Task action failed: $error', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, {'error': 'Task action failed.'}), + ); } } @@ -256,10 +920,38 @@ TasksPageOptions _parseTasksOptions(Map params) { final descending = direction == 'desc'; final filterRaw = params['queue']?.trim(); final filter = filterRaw == null || filterRaw.isEmpty ? null : filterRaw; + final namespaceRaw = params['namespace']?.trim(); + final namespaceFilter = namespaceRaw == null || namespaceRaw.isEmpty + ? null + : namespaceRaw; + final taskRaw = params['task']?.trim(); + final taskFilter = taskRaw == null || taskRaw.isEmpty ? null : taskRaw; + final runRaw = params['runId']?.trim(); + final runId = runRaw == null || runRaw.isEmpty ? null : runRaw; + final stateRaw = params['state']?.trim().toLowerCase(); + final stateFilter = switch (stateRaw) { + 'queued' => TaskState.queued, + 'running' => TaskState.running, + 'succeeded' => TaskState.succeeded, + 'failed' => TaskState.failed, + 'retried' => TaskState.retried, + 'cancelled' => TaskState.cancelled, + _ => null, + }; + final pageRaw = int.tryParse((params['page'] ?? '1').trim()); + final page = pageRaw == null || pageRaw < 1 ? 1 : pageRaw; + final pageSizeRaw = int.tryParse((params['pageSize'] ?? '25').trim()); + final pageSize = (pageSizeRaw ?? 25).clamp(25, 200); return TasksPageOptions( sortKey: sortKey, descending: descending, filter: filter, + namespaceFilter: namespaceFilter, + taskFilter: taskFilter, + runId: runId, + stateFilter: stateFilter, + page: page, + pageSize: pageSize, flashKey: params['flash']?.trim().isEmpty ?? false ? null : params['flash'], errorKey: params['error']?.trim().isEmpty ?? false ? null : params['error'], ); @@ -269,22 +961,145 @@ WorkersPageOptions _parseWorkersOptions(Map params) { final flash = params['flash']?.trim(); final error = params['error']?.trim(); final target = params['scope']?.trim(); + final namespace = params['namespace']?.trim(); return WorkersPageOptions( flashMessage: flash?.isNotEmpty ?? false ? flash : null, errorMessage: error?.isNotEmpty ?? false ? error : null, scope: target?.isNotEmpty ?? false ? target : null, + namespaceFilter: namespace?.isNotEmpty ?? false ? namespace : null, + ); +} + +FailuresPageOptions _parseFailuresOptions(Map params) { + final queue = params['queue']?.trim(); + final flash = params['flash']?.trim(); + final error = params['error']?.trim(); + return FailuresPageOptions( + queue: queue?.isEmpty ?? true ? null : queue, + flashMessage: flash?.isEmpty ?? true ? null : flash, + errorMessage: error?.isEmpty ?? true ? null : error, + ); +} + +SearchPageOptions _parseSearchOptions(Map params) { + final query = params['q']?.trim(); + final scopeRaw = (params['scope'] ?? 'all').trim().toLowerCase(); + final scope = switch (scopeRaw) { + 'tasks' => 'tasks', + 'workers' => 'workers', + 'queues' => 'queues', + 'audit' => 'audit', + _ => 'all', + }; + return SearchPageOptions( + query: query?.isEmpty ?? true ? null : query, + scope: scope, + ); +} + +NamespacesPageOptions _parseNamespacesOptions(Map params) { + final namespace = params['namespace']?.trim(); + return NamespacesPageOptions( + namespace: namespace?.isNotEmpty ?? false ? namespace : null, ); } +WorkflowsPageOptions _parseWorkflowsOptions(Map params) { + final workflow = params['workflow']?.trim(); + final runId = params['runId']?.trim(); + return WorkflowsPageOptions( + workflow: workflow?.isNotEmpty ?? false ? workflow : null, + runId: runId?.isNotEmpty ?? false ? runId : null, + ); +} + +JobsPageOptions _parseJobsOptions(Map params) { + final task = params['task']?.trim(); + final queue = params['queue']?.trim(); + return JobsPageOptions( + task: task?.isNotEmpty ?? false ? task : null, + queue: queue?.isNotEmpty ?? false ? queue : null, + ); +} + +List _applyTaskViewFilters( + List tasks, + TasksPageOptions options, +) { + final queueFilter = options.filter?.toLowerCase(); + final namespaceFilter = options.namespaceFilter?.toLowerCase(); + final taskFilter = options.taskFilter?.toLowerCase(); + final runFilter = options.runId?.toLowerCase(); + return tasks.where((entry) { + if (options.hasFilter) { + final queue = entry.queue.toLowerCase(); + if (!(queueFilter != null && queue.contains(queueFilter))) { + return false; + } + } + if (options.hasNamespaceFilter && + entry.namespace.toLowerCase() != namespaceFilter) { + return false; + } + if (options.hasTaskFilter) { + final name = entry.taskName.toLowerCase(); + if (!(taskFilter != null && name.contains(taskFilter))) { + return false; + } + } + if (options.hasRunIdFilter) { + final runId = entry.runId?.toLowerCase() ?? ''; + if (!(runFilter != null && runId.contains(runFilter))) { + return false; + } + } + if (options.hasStateFilter && entry.state != options.stateFilter) { + return false; + } + return true; + }).toList(growable: false); +} + +String _resolveDefaultNamespace( + List workers, + List tasks, +) { + for (final worker in workers) { + final value = worker.namespace.trim(); + if (value.isNotEmpty) return value; + } + for (final task in tasks) { + final value = task.namespace.trim(); + if (value.isNotEmpty) return value; + } + return 'stem'; +} + Future _controlWorkers( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final namespaceFilter = (await ctx.defaultPostForm('namespace', '')).trim(); + final workersPath = namespaceFilter.isEmpty + ? dashboardRoute(basePath, '/workers') + : _appendRedirectQuery( + dashboardRoute(basePath, '/workers'), + {'namespace': namespaceFilter}, + ); try { final rawAction = (await ctx.postForm('action')).trim().toLowerCase(); if (rawAction.isEmpty) { + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Control action missing.', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Control action missing.')}', + '$workersPath?error=${Uri.encodeComponent('Control action missing.')}', ); } @@ -313,10 +1128,17 @@ Future _controlWorkers( }; if (commandType == null) { + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Unsupported control action "$rawAction".', + ); final encodedError = Uri.encodeComponent( 'Unsupported control action "$rawAction".', ); - return ctx.turboSeeOther('/workers?error=$encodedError'); + return ctx.turboSeeOther('$workersPath?error=$encodedError'); } final payload = {}; @@ -363,10 +1185,18 @@ Future _controlWorkers( if (primaryError is String && primaryError.isNotEmpty) { message.write(' Example: $primaryError'); } + state.recordAudit( + kind: 'action', + action: 'worker.control.$rawAction', + status: 'error', + actor: 'dashboard', + summary: + '$label command reached $scope with $errorReplies error replies.', + ); final encodedMessage = Uri.encodeComponent(message.toString()); final encodedScope = Uri.encodeComponent(scope); return ctx.turboSeeOther( - '/workers?error=$encodedMessage&scope=$encodedScope', + '$workersPath?error=$encodedMessage&scope=$encodedScope', ); } @@ -374,17 +1204,39 @@ Future _controlWorkers( final message = replies.isEmpty ? '$label command sent to $scope.' : '$label command acknowledged by $okReplies $ackLabel from $scope.'; + state.recordAudit( + kind: 'action', + action: 'worker.control.$rawAction', + status: 'ok', + actor: 'dashboard', + summary: message, + ); final encodedMessage = Uri.encodeComponent(message); final encodedScope = Uri.encodeComponent(scope); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + '$workersPath?flash=$encodedMessage&scope=$encodedScope', ); } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] control command failed: $error') - ..writeln(stack); + stemLogger.error( + 'Dashboard control command failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Control command failed: $error', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Control command failed.')}', + '$workersPath?error=${Uri.encodeComponent('Control command failed.')}', ); } } @@ -392,14 +1244,28 @@ Future _controlWorkers( Future _replayDeadLetters( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final redirect = _resolveRedirectPath( + await ctx.defaultPostForm('redirect', dashboardRoute(basePath, '/workers')), + fallbackPath: dashboardRoute(basePath, '/workers'), + ); try { final queue = (await ctx.postForm('queue')).trim(); if (queue.isEmpty) { - final encodedError = Uri.encodeComponent( - 'Queue name is required for replay.', + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'error', + actor: 'dashboard', + summary: 'Replay rejected: missing queue name.', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Queue name is required for replay.', + }), ); - return ctx.turboSeeOther('/workers?error=$encodedError'); } final limitInput = (await ctx.defaultPostForm('limit', '50')).trim(); final limit = int.tryParse(limitInput)?.clamp(1, 500) ?? 50; @@ -421,10 +1287,20 @@ Future _replayDeadLetters( final message = dryRun ? 'Dry run replay found no dead letters for "$queue".' : 'No dead letters replayed for "$queue".'; - final encodedMessage = Uri.encodeComponent(message); - final encodedScope = Uri.encodeComponent(scope); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'ok', + actor: 'dashboard', + summary: message, + metadata: {'queue': queue, 'dryRun': dryRun}, + ); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + _appendRedirectQuery(redirect, { + 'flash': message, + 'scope': scope, + if (redirect == '/failures') 'queue': queue, + }), ); } @@ -434,17 +1310,86 @@ Future _replayDeadLetters( ? 'Dry run replay would consider $entryCount dead letter$entrySuffix ' 'for "$queue".' : 'Replayed $entryCount dead letter$entrySuffix for "$queue".'; - final encodedMessage = Uri.encodeComponent(message); - final encodedScope = Uri.encodeComponent(scope); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'ok', + actor: 'dashboard', + summary: message, + metadata: {'queue': queue, 'entries': entryCount, 'dryRun': dryRun}, + ); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + _appendRedirectQuery(redirect, { + 'flash': message, + 'scope': scope, + if (redirect == '/failures') 'queue': queue, + }), ); } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] DLQ replay failed: $error') - ..writeln(stack); + stemLogger.error( + 'Dashboard dead-letter replay failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'error', + actor: 'dashboard', + summary: 'Dead-letter replay failed: $error', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Failed to replay dead letters.')}', + _appendRedirectQuery(redirect, { + 'error': 'Failed to replay dead letters.', + }), ); } } + +String _resolveRedirectPath( + String? raw, { + required String fallbackPath, +}) { + final value = raw?.trim() ?? ''; + if (value.isEmpty || !value.startsWith('/')) { + return fallbackPath; + } + final uri = Uri.tryParse(value); + if (uri == null || uri.host.isNotEmpty || uri.scheme.isNotEmpty) { + return fallbackPath; + } + return value; +} + +String _appendRedirectQuery( + String path, + Map params, +) { + final uri = Uri.parse(path); + final merged = Map.from(uri.queryParameters); + for (final entry in params.entries) { + if (entry.value.trim().isEmpty) continue; + merged[entry.key] = entry.value; + } + final query = merged.entries + .map( + (entry) { + final key = Uri.encodeQueryComponent(entry.key); + final value = Uri.encodeQueryComponent(entry.value); + return '$key=$value'; + }, + ) + .join('&'); + return query.isEmpty ? uri.path : '${uri.path}?$query'; +} + +bool _isTruthy(String value) { + final normalized = value.trim().toLowerCase(); + return normalized == 'true' || normalized == '1' || normalized == 'yes'; +} diff --git a/packages/dashboard/lib/src/services/models.dart b/packages/dashboard/lib/src/services/models.dart index 54730d0f..42681efd 100644 --- a/packages/dashboard/lib/src/services/models.dart +++ b/packages/dashboard/lib/src/services/models.dart @@ -1,4 +1,14 @@ -import 'package:stem/stem.dart' show QueueHeartbeat, WorkerHeartbeat, stemNow; +import 'package:stem/stem.dart' + show + QueueHeartbeat, + RunState, + TaskState, + TaskStatus, + TaskStatusRecord, + WorkerHeartbeat, + WorkflowStatus, + WorkflowStepEntry, + stemNow; /// Aggregate counts for a queue at a point in time. class QueueSummary { @@ -181,6 +191,604 @@ class DashboardEvent { final Map metadata; } +/// Audit log entry for operator actions and automated alerts. +class DashboardAuditEntry { + /// Creates an audit log entry. + const DashboardAuditEntry({ + required this.id, + required this.timestamp, + required this.kind, + required this.action, + required this.status, + this.actor, + this.summary, + this.metadata = const {}, + }); + + /// Stable entry identifier. + final String id; + + /// Event timestamp. + final DateTime timestamp; + + /// Entry kind: `action` or `alert`. + final String kind; + + /// Action/event type identifier. + final String action; + + /// Status marker (`ok`, `error`, `sent`, `skipped`, etc.). + final String status; + + /// Actor identifier where applicable. + final String? actor; + + /// Human-readable summary. + final String? summary; + + /// Optional metadata payload. + final Map metadata; +} + +/// Dashboard-friendly projection of a persisted task status record. +class DashboardTaskStatusEntry { + /// Creates a task status entry. + const DashboardTaskStatusEntry({ + required this.id, + required this.state, + required this.attempt, + required this.createdAt, + required this.updatedAt, + required this.queue, + required this.taskName, + this.errorMessage, + this.errorType, + this.errorStack, + this.payload, + this.meta = const {}, + this.runId, + this.workflowName, + this.workflowStep, + this.workflowStepIndex, + this.workflowIteration, + this.retryable = false, + }); + + /// Builds a dashboard task entry from a [TaskStatusRecord]. + factory DashboardTaskStatusEntry.fromRecord(TaskStatusRecord record) { + final status = record.status; + final meta = status.meta; + final error = status.error; + final queue = _readQueue(meta); + final taskName = _readTaskName(meta); + return DashboardTaskStatusEntry( + id: status.id, + state: status.state, + attempt: status.attempt, + createdAt: record.createdAt, + updatedAt: record.updatedAt, + queue: queue, + taskName: taskName, + errorMessage: error?.message, + errorType: error?.type, + errorStack: error?.stack, + payload: status.payload, + meta: meta, + runId: meta['stem.workflow.runId']?.toString(), + workflowName: meta['stem.workflow.name']?.toString(), + workflowStep: meta['stem.workflow.step']?.toString(), + workflowStepIndex: _readInt(meta['stem.workflow.stepIndex']), + workflowIteration: _readInt(meta['stem.workflow.iteration']), + retryable: error?.retryable ?? false, + ); + } + + /// Builds a dashboard task entry from a plain [TaskStatus]. + /// + /// Use this when the result backend can return the current status but not + /// the persisted record timestamps. + factory DashboardTaskStatusEntry.fromStatus( + TaskStatus status, { + DateTime? observedAt, + }) { + final seenAt = observedAt?.toUtc() ?? stemNow().toUtc(); + final meta = status.meta; + final queue = _readQueue(meta); + final taskName = _readTaskName(meta); + final error = status.error; + return DashboardTaskStatusEntry( + id: status.id, + state: status.state, + attempt: status.attempt, + createdAt: seenAt, + updatedAt: seenAt, + queue: queue, + taskName: taskName, + errorMessage: error?.message, + errorType: error?.type, + errorStack: error?.stack, + payload: status.payload, + meta: meta, + runId: meta['stem.workflow.runId']?.toString(), + workflowName: meta['stem.workflow.name']?.toString(), + workflowStep: meta['stem.workflow.step']?.toString(), + workflowStepIndex: _readInt(meta['stem.workflow.stepIndex']), + workflowIteration: _readInt(meta['stem.workflow.iteration']), + retryable: error?.retryable ?? false, + ); + } + + /// Task identifier. + final String id; + + /// Current lifecycle state. + final TaskState state; + + /// Attempt count for this status. + final int attempt; + + /// Record creation timestamp. + final DateTime createdAt; + + /// Record update timestamp. + final DateTime updatedAt; + + /// Queue associated with the task. + final String queue; + + /// Task handler name if available. + final String taskName; + + /// Failure message when [state] is failed/retried. + final String? errorMessage; + + /// Failure type when [state] is failed/retried. + final String? errorType; + + /// Failure stack trace when captured by the backend. + final String? errorStack; + + /// Persisted task result payload. + final Object? payload; + + /// Raw task metadata from the result backend. + final Map meta; + + /// Workflow run identifier, when this task is part of a workflow. + final String? runId; + + /// Workflow name, when present. + final String? workflowName; + + /// Workflow step name, when present. + final String? workflowStep; + + /// Workflow step index, when present. + final int? workflowStepIndex; + + /// Workflow iteration, when present. + final int? workflowIteration; + + /// Whether the failure is marked retryable. + final bool retryable; + + /// Namespace reported by task metadata, or `stem` when unavailable. + String get namespace => _readNamespace(meta); + + /// Whether this entry represents a workflow task. + bool get isWorkflowTask => + runId != null || + taskName.startsWith('stem.workflow.') || + taskName.contains('workflow'); + + /// Whether this entry is in a failed terminal state. + bool get isFailure => + state == TaskState.failed || state == TaskState.cancelled; + + /// Fingerprint used to group related failures in diagnostics views. + String get errorFingerprint { + final type = (errorType ?? 'Unknown').trim(); + final message = (errorMessage ?? 'No message').trim(); + return '$type: $message'; + } + + /// Task processing start timestamp, when recorded by workers. + DateTime? get startedAt => _readDate(meta['startedAt']); + + /// Task completion/failure timestamp, when recorded by workers. + DateTime? get finishedAt => + _readDate(meta['completedAt']) ?? _readDate(meta['failedAt']); + + /// Estimated queue wait from persisted record creation to processing start. + Duration? get queueWait { + final started = startedAt; + if (started == null) return null; + final value = started.difference(createdAt.toUtc()); + if (value.isNegative) return Duration.zero; + return value; + } + + /// Estimated processing time from start to finish/last update. + Duration? get processingTime { + final started = startedAt; + if (started == null) return null; + final end = finishedAt ?? updatedAt.toUtc(); + final value = end.difference(started); + if (value.isNegative) return Duration.zero; + return value; + } +} + +/// App-focused namespace summary for dashboard observability. +class DashboardNamespaceSnapshot { + /// Creates a namespace summary. + const DashboardNamespaceSnapshot({ + required this.namespace, + required this.queueCount, + required this.workerCount, + required this.pending, + required this.inflight, + required this.deadLetters, + required this.runningTasks, + required this.failedTasks, + required this.workflowRuns, + }); + + /// Namespace identifier. + final String namespace; + + /// Number of distinct queues seen for this namespace. + final int queueCount; + + /// Number of active workers in this namespace. + final int workerCount; + + /// Pending queue depth. + final int pending; + + /// In-flight envelope count. + final int inflight; + + /// Dead-letter count. + final int deadLetters; + + /// Running task statuses. + final int runningTasks; + + /// Failed terminal task statuses. + final int failedTasks; + + /// Distinct workflow run ids observed in task metadata. + final int workflowRuns; +} + +/// Aggregate task summary grouped by task name. +class DashboardJobSummary { + /// Creates a task/job summary. + const DashboardJobSummary({ + required this.taskName, + required this.sampleQueue, + required this.total, + required this.running, + required this.succeeded, + required this.failed, + required this.retried, + required this.cancelled, + required this.lastUpdated, + }); + + /// Task handler name. + final String taskName; + + /// Queue most commonly associated with this task in sampled statuses. + final String sampleQueue; + + /// Total sampled statuses for this task. + final int total; + + /// Running count. + final int running; + + /// Success count. + final int succeeded; + + /// Failure count. + final int failed; + + /// Retried count. + final int retried; + + /// Cancelled count. + final int cancelled; + + /// Most recent update timestamp across sampled statuses. + final DateTime lastUpdated; + + /// Failure ratio in sampled statuses. + double get failureRatio => total <= 0 ? 0 : failed / total; +} + +/// Workflow run summary projected from task status metadata. +class DashboardWorkflowRunSummary { + /// Creates a workflow summary. + const DashboardWorkflowRunSummary({ + required this.runId, + required this.workflowName, + required this.lastStep, + required this.total, + required this.queued, + required this.running, + required this.succeeded, + required this.failed, + required this.cancelled, + required this.lastUpdated, + }); + + /// Workflow run id. + final String runId; + + /// Workflow name, when available. + final String workflowName; + + /// Most recent step marker, when available. + final String? lastStep; + + /// Total sampled statuses for this run. + final int total; + + /// Queued count. + final int queued; + + /// Running count. + final int running; + + /// Succeeded count. + final int succeeded; + + /// Failed count. + final int failed; + + /// Cancelled count. + final int cancelled; + + /// Most recent update timestamp. + final DateTime lastUpdated; +} + +/// Builds app-focused namespace summaries from sampled runtime state. +List buildNamespaceSnapshots({ + required List queues, + required List workers, + required List tasks, + String defaultNamespace = 'stem', +}) { + final queueNamesByNamespace = >{}; + final pendingByNamespace = {}; + final inflightByNamespace = {}; + final deadByNamespace = {}; + final workerCountByNamespace = {}; + final runningByNamespace = {}; + final failedByNamespace = {}; + final runsByNamespace = >{}; + + for (final queue in queues) { + queueNamesByNamespace.putIfAbsent(defaultNamespace, () => {}).add( + queue.queue, + ); + pendingByNamespace[defaultNamespace] = + (pendingByNamespace[defaultNamespace] ?? 0) + queue.pending; + inflightByNamespace[defaultNamespace] = + (inflightByNamespace[defaultNamespace] ?? 0) + queue.inflight; + deadByNamespace[defaultNamespace] = + (deadByNamespace[defaultNamespace] ?? 0) + queue.deadLetters; + } + + for (final worker in workers) { + final namespace = worker.namespace.trim().isEmpty + ? defaultNamespace + : worker.namespace.trim(); + workerCountByNamespace[namespace] = + (workerCountByNamespace[namespace] ?? 0) + 1; + final names = queueNamesByNamespace.putIfAbsent( + namespace, + () => {}, + ); + for (final queue in worker.queues) { + names.add(queue.name); + } + } + + for (final task in tasks) { + final namespace = task.namespace.trim().isEmpty + ? defaultNamespace + : task.namespace.trim(); + queueNamesByNamespace.putIfAbsent(namespace, () => {}).add( + task.queue, + ); + if (task.state == TaskState.running) { + runningByNamespace[namespace] = (runningByNamespace[namespace] ?? 0) + 1; + } + if (task.isFailure) { + failedByNamespace[namespace] = (failedByNamespace[namespace] ?? 0) + 1; + } + if (task.runId != null && task.runId!.isNotEmpty) { + runsByNamespace.putIfAbsent(namespace, () => {}).add(task.runId!); + } + } + + final namespaces = { + ...queueNamesByNamespace.keys, + ...workerCountByNamespace.keys, + ...runningByNamespace.keys, + ...failedByNamespace.keys, + ...runsByNamespace.keys, + }.toList(growable: false) + ..sort(); + + return namespaces.map((namespace) { + return DashboardNamespaceSnapshot( + namespace: namespace, + queueCount: queueNamesByNamespace[namespace]?.length ?? 0, + workerCount: workerCountByNamespace[namespace] ?? 0, + pending: pendingByNamespace[namespace] ?? 0, + inflight: inflightByNamespace[namespace] ?? 0, + deadLetters: deadByNamespace[namespace] ?? 0, + runningTasks: runningByNamespace[namespace] ?? 0, + failedTasks: failedByNamespace[namespace] ?? 0, + workflowRuns: runsByNamespace[namespace]?.length ?? 0, + ); + }).toList(growable: false); +} + +/// Builds task/job summaries grouped by task name. +List buildJobSummaries( + List tasks, { + int limit = 20, +}) { + final buckets = {}; + for (final task in tasks) { + buckets + .putIfAbsent( + task.taskName, + () => _DashboardJobSummaryBuilder(taskName: task.taskName), + ) + .add(task); + } + final results = buckets.values.map((bucket) => bucket.build()).toList() + ..sort((a, b) { + final byTotal = b.total.compareTo(a.total); + if (byTotal != 0) return byTotal; + return b.lastUpdated.compareTo(a.lastUpdated); + }); + final bounded = limit < 1 ? 1 : limit; + return results.take(bounded).toList(growable: false); +} + +/// Builds workflow run summaries grouped by run id. +List buildWorkflowRunSummaries( + List tasks, { + int limit = 20, +}) { + final buckets = {}; + for (final task in tasks) { + final runId = task.runId?.trim(); + if (runId == null || runId.isEmpty) continue; + buckets + .putIfAbsent(runId, () => _DashboardWorkflowSummaryBuilder(runId)) + .add(task); + } + final results = buckets.values.map((bucket) => bucket.build()).toList() + ..sort((a, b) => b.lastUpdated.compareTo(a.lastUpdated)); + final bounded = limit < 1 ? 1 : limit; + return results.take(bounded).toList(growable: false); +} + +/// Projection of a workflow run snapshot for dashboard rendering. +class DashboardWorkflowRunSnapshot { + /// Creates a workflow run snapshot. + const DashboardWorkflowRunSnapshot({ + required this.id, + required this.workflow, + required this.status, + required this.cursor, + required this.createdAt, + this.updatedAt, + this.waitTopic, + this.resumeAt, + this.ownerId, + this.leaseExpiresAt, + this.lastError, + this.result, + }); + + /// Builds a dashboard workflow run snapshot from [RunState]. + factory DashboardWorkflowRunSnapshot.fromRunState(RunState state) { + return DashboardWorkflowRunSnapshot( + id: state.id, + workflow: state.workflow, + status: state.status, + cursor: state.cursor, + createdAt: state.createdAt, + updatedAt: state.updatedAt, + waitTopic: state.waitTopic, + resumeAt: state.resumeAt, + ownerId: state.ownerId, + leaseExpiresAt: state.leaseExpiresAt, + lastError: state.lastError, + result: state.result, + ); + } + + /// Run identifier. + final String id; + + /// Workflow name. + final String workflow; + + /// Current lifecycle state. + final WorkflowStatus status; + + /// Next step cursor. + final int cursor; + + /// Run creation timestamp. + final DateTime createdAt; + + /// Most recent mutation timestamp. + final DateTime? updatedAt; + + /// Topic currently awaited by this run, when suspended. + final String? waitTopic; + + /// Resume deadline for suspended runs. + final DateTime? resumeAt; + + /// Owner of the active lease when running. + final String? ownerId; + + /// Lease expiration if the run is claimed. + final DateTime? leaseExpiresAt; + + /// Last error payload recorded by the workflow runtime. + final Map? lastError; + + /// Final workflow result payload when completed. + final Object? result; +} + +/// Projection of a persisted workflow step checkpoint. +class DashboardWorkflowStepSnapshot { + /// Creates a workflow step snapshot. + const DashboardWorkflowStepSnapshot({ + required this.name, + required this.position, + required this.value, + this.completedAt, + }); + + /// Builds a workflow step snapshot from [WorkflowStepEntry]. + factory DashboardWorkflowStepSnapshot.fromEntry(WorkflowStepEntry entry) { + return DashboardWorkflowStepSnapshot( + name: entry.name, + position: entry.position, + value: entry.value, + completedAt: entry.completedAt, + ); + } + + /// Step name. + final String name; + + /// Step ordering position. + final int position; + + /// Persisted checkpoint value. + final Object? value; + + /// Completion timestamp if available. + final DateTime? completedAt; +} + /// Task request submitted from the dashboard UI. class EnqueueRequest { /// Creates a task enqueue request. @@ -207,3 +815,133 @@ class EnqueueRequest { /// Maximum retry count for the task. final int maxRetries; } + +int? _readInt(Object? value) { + if (value == null) return null; + if (value is int) return value; + if (value is num) return value.toInt(); + return int.tryParse(value.toString()); +} + +DateTime? _readDate(Object? value) { + if (value == null) return null; + if (value is DateTime) return value.toUtc(); + return DateTime.tryParse(value.toString())?.toUtc(); +} + +String _readTaskName(Map meta) { + return meta['task']?.toString() ?? + meta['stem.task']?.toString() ?? + meta['name']?.toString() ?? + meta['taskName']?.toString() ?? + 'unknown'; +} + +String _readQueue(Map meta) { + return meta['queue']?.toString() ?? + meta['stem.queue']?.toString() ?? + 'default'; +} + +String _readNamespace(Map meta) { + return meta['namespace']?.toString() ?? + meta['stem.namespace']?.toString() ?? + 'stem'; +} + +class _DashboardJobSummaryBuilder { + _DashboardJobSummaryBuilder({required this.taskName}); + + final String taskName; + final Map _queueHits = {}; + var _total = 0; + var _running = 0; + var _succeeded = 0; + var _failed = 0; + var _retried = 0; + var _cancelled = 0; + DateTime _lastUpdated = DateTime.fromMillisecondsSinceEpoch(0, isUtc: true); + + void add(DashboardTaskStatusEntry task) { + _total += 1; + _queueHits[task.queue] = (_queueHits[task.queue] ?? 0) + 1; + if (task.state == TaskState.running) _running += 1; + if (task.state == TaskState.succeeded) _succeeded += 1; + if (task.state == TaskState.failed) _failed += 1; + if (task.state == TaskState.retried) _retried += 1; + if (task.state == TaskState.cancelled) _cancelled += 1; + if (task.updatedAt.toUtc().isAfter(_lastUpdated)) { + _lastUpdated = task.updatedAt.toUtc(); + } + } + + DashboardJobSummary build() { + final sampleQueue = _queueHits.entries.isEmpty + ? 'default' + : (_queueHits.entries.toList() + ..sort((a, b) => b.value.compareTo(a.value))) + .first + .key; + return DashboardJobSummary( + taskName: taskName, + sampleQueue: sampleQueue, + total: _total, + running: _running, + succeeded: _succeeded, + failed: _failed, + retried: _retried, + cancelled: _cancelled, + lastUpdated: _lastUpdated, + ); + } +} + +class _DashboardWorkflowSummaryBuilder { + _DashboardWorkflowSummaryBuilder(this.runId); + + final String runId; + String _workflowName = 'workflow'; + String? _lastStep; + var _total = 0; + var _queued = 0; + var _running = 0; + var _succeeded = 0; + var _failed = 0; + var _cancelled = 0; + DateTime _lastUpdated = DateTime.fromMillisecondsSinceEpoch(0, isUtc: true); + + void add(DashboardTaskStatusEntry task) { + _total += 1; + if (task.workflowName != null && task.workflowName!.isNotEmpty) { + _workflowName = task.workflowName!; + } + if (task.workflowStep != null && task.workflowStep!.isNotEmpty) { + _lastStep = task.workflowStep; + } + if (task.state == TaskState.queued || task.state == TaskState.retried) { + _queued += 1; + } + if (task.state == TaskState.running) _running += 1; + if (task.state == TaskState.succeeded) _succeeded += 1; + if (task.state == TaskState.failed) _failed += 1; + if (task.state == TaskState.cancelled) _cancelled += 1; + if (task.updatedAt.toUtc().isAfter(_lastUpdated)) { + _lastUpdated = task.updatedAt.toUtc(); + } + } + + DashboardWorkflowRunSummary build() { + return DashboardWorkflowRunSummary( + runId: runId, + workflowName: _workflowName, + lastStep: _lastStep, + total: _total, + queued: _queued, + running: _running, + succeeded: _succeeded, + failed: _failed, + cancelled: _cancelled, + lastUpdated: _lastUpdated, + ); + } +} diff --git a/packages/dashboard/lib/src/services/stem_service.dart b/packages/dashboard/lib/src/services/stem_service.dart index 4eb872d1..b90ded04 100644 --- a/packages/dashboard/lib/src/services/stem_service.dart +++ b/packages/dashboard/lib/src/services/stem_service.dart @@ -1,10 +1,13 @@ import 'dart:async'; +import 'dart:io'; import 'package:stem/stem.dart'; import 'package:stem_cli/stem_cli.dart'; - import 'package:stem_dashboard/src/config/config.dart'; import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_postgres/stem_postgres.dart'; +import 'package:stem_redis/stem_redis.dart'; +import 'package:stem_sqlite/stem_sqlite.dart'; /// Contract for dashboard services that load queue and worker data. abstract class DashboardDataSource { @@ -14,6 +17,29 @@ abstract class DashboardDataSource { /// Fetches current worker status snapshots. Future> fetchWorkerStatuses(); + /// Fetches persisted task statuses for observability views. + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }); + + /// Fetches a single task status by [taskId]. + Future fetchTaskStatus(String taskId); + + /// Fetches statuses belonging to a workflow [runId]. + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }); + + /// Fetches persisted workflow run snapshot, if a workflow store is available. + Future fetchWorkflowRun(String runId); + + /// Fetches persisted workflow checkpoints, if a workflow store is available. + Future> fetchWorkflowSteps(String runId); + /// Enqueues a task request through the backing broker. Future enqueueTask(EnqueueRequest request); @@ -24,6 +50,20 @@ abstract class DashboardDataSource { bool dryRun = false, }); + /// Replays a specific dead-letter task by [taskId]. + /// + /// Returns `true` when the entry was found and replayed. + Future replayTaskById(String taskId, {String? queue}); + + /// Requests revocation for [taskId]. + /// + /// Returns `true` when a revoke store is configured and the request is saved. + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }); + /// Sends a control command and returns any replies collected. Future> sendControlCommand( ControlCommandMessage command, { @@ -40,28 +80,57 @@ class StemDashboardService implements DashboardDataSource { required DashboardConfig config, required Broker broker, ResultBackend? backend, + WorkflowStore? workflowStore, + RevokeStore? revokeStore, + Future Function()? disposeContext, + Future<_DashboardRuntimeContext> Function()? reloadRuntimeContext, + bool ownsWorkflowStore = false, }) : _config = config, _namespace = config.namespace, + _signer = PayloadSigner.maybe(config.stem.signing), _broker = broker, - _backend = backend; + _backend = backend, + _workflowStore = workflowStore, + _revokeStore = revokeStore, + _disposeContext = disposeContext, + _reloadRuntimeContext = reloadRuntimeContext, + _ownsWorkflowStore = ownsWorkflowStore; final DashboardConfig _config; final String _namespace; - final Broker _broker; - final ResultBackend? _backend; + final PayloadSigner? _signer; + Broker _broker; + ResultBackend? _backend; + final WorkflowStore? _workflowStore; + RevokeStore? _revokeStore; + Future Function()? _disposeContext; + final Future<_DashboardRuntimeContext> Function()? _reloadRuntimeContext; + Future? _runtimeReconnectFuture; + Future _runtimeOperationQueue = Future.value(); + final bool _ownsWorkflowStore; + var _closed = false; /// Creates a dashboard service using [config]. /// /// Uses [createDefaultContext] to set up broker and backend from environment. static Future connect(DashboardConfig config) async { - final ctx = await createDefaultContext( - environment: Map.from(config.environment), + final runtimeContext = await _createRuntimeContext(config); + + final workflowStore = await _connectWorkflowStore( + config.environment['STEM_WORKFLOW_STORE_URL'], + namespace: _resolveWorkflowNamespace(config), + tls: config.tls, ); return StemDashboardService._( config: config, - broker: ctx.broker, - backend: ctx.backend, + broker: runtimeContext.broker, + backend: runtimeContext.backend, + workflowStore: workflowStore, + revokeStore: runtimeContext.revokeStore, + disposeContext: runtimeContext.dispose, + reloadRuntimeContext: () => _createRuntimeContext(config), + ownsWorkflowStore: true, ); } @@ -73,47 +142,164 @@ class StemDashboardService implements DashboardDataSource { required DashboardConfig config, required Broker broker, ResultBackend? backend, + WorkflowStore? workflowStore, + RevokeStore? revokeStore, }) async { return StemDashboardService._( config: config, broker: broker, backend: backend, + workflowStore: workflowStore, + revokeStore: revokeStore, ); } @override Future> fetchQueueSummaries() async { - final queues = await _discoverQueues(); - final summaries = []; + try { + return await _withRuntimeReconnectRetry(_fetchQueueSummariesImpl); + } on Object catch (error, stack) { + _logReadFailure('fetchQueueSummaries', error, stack); + return const []; + } + } - for (final queue in queues) { - final pending = await _broker.pendingCount(queue) ?? 0; - final inflight = await _broker.inflightCount(queue) ?? 0; - final dead = await _deadLetterCount(queue); + @override + Future> fetchWorkerStatuses() async { + try { + final heartbeats = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) return const []; + return backend.listWorkerHeartbeats(); + }); + return heartbeats.map(WorkerStatus.fromHeartbeat).toList(growable: false) + ..sort((a, b) => a.workerId.compareTo(b.workerId)); + } on Object catch (error, stack) { + _logReadFailure('fetchWorkerStatuses', error, stack); + return const []; + } + } - summaries.add( - QueueSummary( - queue: queue, - pending: pending, - inflight: inflight, - deadLetters: dead, - ), - ); + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async { + final resolvedQueue = queue?.trim(); + final boundedLimit = limit.clamp(1, 500); + final boundedOffset = offset < 0 ? 0 : offset; + try { + final page = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) { + return const TaskStatusPage(items: []); + } + return backend.listTaskStatuses( + TaskStatusListRequest( + state: state, + queue: resolvedQueue == null || resolvedQueue.isEmpty + ? null + : resolvedQueue, + limit: boundedLimit, + offset: boundedOffset, + ), + ); + }); + return page.items + .map(DashboardTaskStatusEntry.fromRecord) + .toList(growable: false); + } on Object catch (error, stack) { + _logReadFailure('fetchTaskStatuses', error, stack); + return const []; } + } - summaries.sort((a, b) => a.queue.compareTo(b.queue)); - return summaries; + @override + Future fetchTaskStatus(String taskId) async { + final trimmed = taskId.trim(); + if (trimmed.isEmpty) return null; + + try { + final record = await _findTaskStatusRecord(trimmed); + if (record != null) { + return DashboardTaskStatusEntry.fromRecord(record); + } + final backend = _backend; + if (backend == null) return null; + final status = await backend.get(trimmed); + if (status == null) { + return null; + } + return DashboardTaskStatusEntry.fromStatus(status); + } on Object { + return null; + } } @override - Future> fetchWorkerStatuses() async { - final backend = _backend; - if (backend == null) return const []; + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async { + final trimmed = runId.trim(); + if (trimmed.isEmpty) return const []; try { - final heartbeats = await backend.listWorkerHeartbeats(); - return heartbeats.map(WorkerStatus.fromHeartbeat).toList(growable: false) - ..sort((a, b) => a.workerId.compareTo(b.workerId)); + final page = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) { + return const TaskStatusPage(items: []); + } + return backend.listTaskStatuses( + TaskStatusListRequest( + meta: {'stem.workflow.runId': trimmed}, + limit: limit.clamp(1, 500), + ), + ); + }); + return page.items + .map(DashboardTaskStatusEntry.fromRecord) + .toList(growable: false); + } on Object { + return const []; + } + } + + @override + Future fetchWorkflowRun(String runId) async { + final store = _workflowStore; + if (store == null) return null; + + final trimmed = runId.trim(); + if (trimmed.isEmpty) return null; + + try { + final run = await store.get(trimmed); + if (run == null) return null; + return DashboardWorkflowRunSnapshot.fromRunState(run); + } on Object { + return null; + } + } + + @override + Future> fetchWorkflowSteps( + String runId, + ) async { + final store = _workflowStore; + if (store == null) return const []; + + final trimmed = runId.trim(); + if (trimmed.isEmpty) return const []; + + try { + final steps = await store.listSteps(trimmed); + return steps + .map(DashboardWorkflowStepSnapshot.fromEntry) + .toList(growable: false) + ..sort((a, b) => a.position.compareTo(b.position)); } on Object { return const []; } @@ -130,7 +316,7 @@ class StemDashboardService implements DashboardDataSource { maxRetries: request.maxRetries, meta: const {'source': 'dashboard'}, ); - await _broker.publish(envelope); + await _publishEnvelope(envelope); } @override @@ -140,7 +326,91 @@ class StemDashboardService implements DashboardDataSource { bool dryRun = false, }) async { final bounded = limit.clamp(1, 500); - return _broker.replayDeadLetters(queue, limit: bounded, dryRun: dryRun); + return _withRuntimeReconnectRetry( + () => _broker.replayDeadLetters(queue, limit: bounded, dryRun: dryRun), + ); + } + + @override + Future replayTaskById(String taskId, {String? queue}) async { + final trimmedTask = taskId.trim(); + if (trimmedTask.isEmpty) return false; + + final resolvedQueue = await _resolveReplayQueue(trimmedTask, queue: queue); + if (resolvedQueue == null) { + return false; + } + + final deadLetter = await _withRuntimeReconnectRetry( + () => _broker.getDeadLetter(resolvedQueue, trimmedTask), + ); + if (deadLetter == null) { + return false; + } + + final now = stemNow().toUtc(); + final original = deadLetter.envelope; + final replayMeta = Map.from(original.meta) + ..['source'] = 'dashboard' + ..['dashboard.replayFromTaskId'] = trimmedTask + ..['dashboard.replayedAt'] = now.toIso8601String(); + final replay = original.copyWith( + id: generateEnvelopeId(), + attempt: 0, + enqueuedAt: now, + meta: replayMeta, + ); + await _publishEnvelope(replay); + + final backend = _backend; + if (backend != null) { + final queuedMeta = { + 'queue': replay.queue, + 'task': replay.name, + ...replayMeta, + }; + await backend.set( + replay.id, + TaskState.queued, + attempt: 0, + meta: queuedMeta, + ); + } + return true; + } + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async { + final trimmedTask = taskId.trim(); + if (trimmedTask.isEmpty) return false; + + final now = stemNow().toUtc(); + final trimmedReason = reason?.trim(); + final entry = RevokeEntry( + namespace: _namespace, + taskId: trimmedTask, + version: generateRevokeVersion(), + issuedAt: now, + terminate: terminate, + reason: trimmedReason == null || trimmedReason.isEmpty + ? null + : trimmedReason, + requestedBy: 'dashboard', + ); + try { + final store = _revokeStore; + if (store == null) { + return false; + } + await store.upsertAll([entry]); + return true; + } on Object { + return false; + } } @override @@ -172,7 +442,7 @@ class StemDashboardService implements DashboardDataSource { meta: const {'source': 'dashboard'}, enqueuedAt: now, ); - await _broker.publish(envelope); + await _publishEnvelope(envelope); } final expectedReplies = command.targets.isEmpty @@ -180,14 +450,18 @@ class StemDashboardService implements DashboardDataSource { : command.targets.length; final prefetch = expectedReplies == null ? 8 : expectedReplies.clamp(1, 32); - final subscription = _broker.consume( - RoutingSubscription.singleQueue(replyQueue), - consumerGroup: _controlConsumerGroup, - consumerName: 'dashboard-${command.requestId}', - prefetch: prefetch, + final subscription = await _withRuntimeReconnectRetry>( + () async { + return _broker.consume( + RoutingSubscription.singleQueue(replyQueue), + consumerGroup: _controlConsumerGroup, + consumerName: 'dashboard-${command.requestId}', + prefetch: prefetch, + ); + }, ); - final iterator = StreamIterator(subscription); + final iterator = StreamIterator(subscription); final replies = []; final deadline = stemNow().add(timeout); @@ -210,9 +484,11 @@ class StemDashboardService implements DashboardDataSource { try { final reply = controlReplyFromEnvelope(delivery.envelope); replies.add(reply); - await _broker.ack(delivery); + await _withRuntimeReconnectRetry(() => _broker.ack(delivery)); } on Object { - await _broker.nack(delivery, requeue: false); + await _withRuntimeReconnectRetry( + () => _broker.nack(delivery, requeue: false), + ); } if (expectedReplies != null && replies.length >= expectedReplies) { @@ -229,9 +505,155 @@ class StemDashboardService implements DashboardDataSource { @override Future close() async { - // Note: The broker and backend will be closed when the context is disposed. - // Since we got them from createDefaultContext, we don't own their - // lifecycle. + if (_closed) return; + _closed = true; + + if (_ownsWorkflowStore) { + await _disposeWorkflowStore(_workflowStore); + } + + await _disposeRuntimeContext(); + } + + Future> _fetchQueueSummariesImpl() async { + final queues = await _discoverQueues(); + final summaries = []; + + for (final queue in queues) { + final pending = await _broker.pendingCount(queue) ?? 0; + final inflight = await _broker.inflightCount(queue) ?? 0; + final dead = await _deadLetterCount(queue); + + summaries.add( + QueueSummary( + queue: queue, + pending: pending, + inflight: inflight, + deadLetters: dead, + ), + ); + } + + summaries.sort((a, b) => a.queue.compareTo(b.queue)); + return summaries; + } + + Future _withRuntimeReconnectRetry(Future Function() action) { + return _serializeRuntimeAccess(() async { + try { + return await action(); + } on Object catch (error) { + final recovered = await _recoverRuntimeContextIfNeeded(error); + if (!recovered) { + rethrow; + } + return action(); + } + }); + } + + Future _recoverRuntimeContextIfNeeded(Object error) async { + if (_closed || !_isRecoverableConnectionError(error)) { + return false; + } + + final reloadRuntimeContext = _reloadRuntimeContext; + if (reloadRuntimeContext == null) { + return false; + } + + try { + await _reconnectRuntimeContext(reloadRuntimeContext); + return true; + } on Object { + return false; + } + } + + bool _isRecoverableConnectionError(Object error) { + if (error is SocketException || + error is IOException || + error is StateError) { + return true; + } + final message = '$error'.toLowerCase(); + return message.contains('streamsink is closed') || + message.contains('stream is closed') || + message.contains('connection closed') || + message.contains('not connected') || + message.contains('connection refused') || + message.contains('socket is closed') || + message.contains('broken pipe') || + message.contains('timed out') || + message.contains('connection reset'); + } + + Future _serializeRuntimeAccess(Future Function() action) { + final completer = Completer(); + _runtimeOperationQueue = _runtimeOperationQueue.catchError((_) {}).then(( + _, + ) async { + try { + completer.complete(await action()); + } on Object catch (error, stack) { + completer.completeError(error, stack); + } + }); + return completer.future; + } + + void _logReadFailure(String operation, Object error, StackTrace stack) { + stemLogger.warning( + 'Dashboard data read failed', + stemLogContext( + component: 'dashboard', + subsystem: 'service', + fields: { + 'operation': operation, + 'error': '$error', + 'stack': '$stack', + }, + ), + ); + } + + Future _reconnectRuntimeContext( + Future<_DashboardRuntimeContext> Function() reloadRuntimeContext, + ) async { + if (_runtimeReconnectFuture != null) { + return _runtimeReconnectFuture!; + } + final completer = Completer(); + _runtimeReconnectFuture = completer.future; + try { + final nextContext = await reloadRuntimeContext(); + final previousDispose = _disposeContext; + _broker = nextContext.broker; + _backend = nextContext.backend; + _revokeStore = nextContext.revokeStore; + _disposeContext = nextContext.dispose; + if (previousDispose != null) { + try { + await previousDispose(); + } on Object { + // Ignore disposal failures while recovering from connection errors. + } + } + completer.complete(); + } on Object catch (error, stack) { + completer.completeError(error, stack); + rethrow; + } finally { + _runtimeReconnectFuture = null; + } + } + + Future _disposeRuntimeContext() async { + final disposeContext = _disposeContext; + _disposeContext = null; + if (disposeContext != null) { + await disposeContext(); + } } Future> _discoverQueues() async { @@ -294,11 +716,159 @@ class StemDashboardService implements DashboardDataSource { Future _purgeQueue(String queue) async { try { - await _broker.purge(queue); + await _withRuntimeReconnectRetry(() => _broker.purge(queue)); } on Object { // Some brokers may not support purge; ignore failures. } } + Future _publishEnvelope(Envelope envelope) async { + final signer = _signer; + final payload = signer == null ? envelope : await signer.sign(envelope); + await _withRuntimeReconnectRetry(() => _broker.publish(payload)); + } + + Future _findTaskStatusRecord(String taskId) async { + final backend = _backend; + if (backend == null) return null; + + var offset = 0; + const pageSize = 200; + const maxPages = 10; + + for (var pageIndex = 0; pageIndex < maxPages; pageIndex++) { + final page = await backend.listTaskStatuses( + TaskStatusListRequest(limit: pageSize, offset: offset), + ); + for (final item in page.items) { + if (item.status.id == taskId) { + return item; + } + } + final nextOffset = page.nextOffset; + if (nextOffset == null) { + break; + } + offset = nextOffset; + } + return null; + } + + Future _resolveReplayQueue( + String taskId, { + String? queue, + }) async { + final explicit = queue?.trim(); + if (explicit != null && explicit.isNotEmpty) { + return explicit; + } + + final status = await fetchTaskStatus(taskId); + final statusQueue = status?.queue.trim(); + if (statusQueue != null && statusQueue.isNotEmpty) { + return statusQueue; + } + + final queues = await _discoverQueues(); + for (final candidate in queues) { + final entry = await _broker.getDeadLetter(candidate, taskId); + if (entry != null) { + return candidate; + } + } + return null; + } + + static String _resolveWorkflowNamespace(DashboardConfig config) { + final raw = config.environment['STEM_WORKFLOW_NAMESPACE']?.trim(); + if (raw != null && raw.isNotEmpty) { + return raw; + } + return config.namespace; + } + + static Future _connectWorkflowStore( + String? url, { + required String namespace, + required TlsConfig tls, + }) async { + final trimmed = url?.trim(); + if (trimmed == null || trimmed.isEmpty) { + return null; + } + + final uri = Uri.parse(trimmed); + switch (uri.scheme) { + case 'redis': + case 'rediss': + return RedisWorkflowStore.connect( + trimmed, + namespace: namespace, + tls: tls, + ); + case 'postgres': + case 'postgresql': + case 'postgresql+ssl': + case 'postgres+ssl': + return PostgresWorkflowStore.connect( + trimmed, + namespace: namespace, + applicationName: 'stem-dashboard-workflow', + tls: tls, + ); + case 'sqlite': + final path = uri.path.isNotEmpty ? uri.path : 'workflow.sqlite'; + return SqliteWorkflowStore.open(File(path)); + case 'file': + return SqliteWorkflowStore.open(File(uri.toFilePath())); + case 'memory': + return InMemoryWorkflowStore(); + default: + return null; + } + } + + static Future _disposeWorkflowStore(WorkflowStore? store) async { + if (store is RedisWorkflowStore) { + await store.close(); + return; + } + if (store is PostgresWorkflowStore) { + await store.close(); + return; + } + if (store is SqliteWorkflowStore) { + await store.close(); + } + } + + static Future<_DashboardRuntimeContext> _createRuntimeContext( + DashboardConfig config, + ) async { + final context = await createDefaultContext( + environment: Map.from(config.environment), + ); + return _DashboardRuntimeContext( + broker: context.broker, + backend: context.backend, + revokeStore: context.revokeStore, + dispose: context.dispose, + ); + } + static const _controlConsumerGroup = 'stem-dashboard-control'; } + +class _DashboardRuntimeContext { + const _DashboardRuntimeContext({ + required this.broker, + required this.backend, + required this.revokeStore, + required this.dispose, + }); + + final Broker broker; + final ResultBackend? backend; + final RevokeStore? revokeStore; + final Future Function() dispose; +} diff --git a/packages/dashboard/lib/src/state/dashboard_state.dart b/packages/dashboard/lib/src/state/dashboard_state.dart index 1b4e193a..0dd8696d 100644 --- a/packages/dashboard/lib/src/state/dashboard_state.dart +++ b/packages/dashboard/lib/src/state/dashboard_state.dart @@ -1,8 +1,10 @@ import 'dart:async'; +import 'dart:convert'; +import 'dart:io'; import 'package:meta/meta.dart'; import 'package:routed_hotwire/routed_hotwire.dart'; -import 'package:stem/stem.dart' show stemNow; +import 'package:stem/stem.dart' show TaskState, stemNow; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; import 'package:stem_dashboard/src/ui/event_templates.dart'; @@ -14,8 +16,16 @@ class DashboardState { required this.service, this.pollInterval = const Duration(seconds: 5), this.eventLimit = 200, + this.auditLimit = 300, + this.alertWebhookUrls = const [], + this.alertBacklogThreshold = 500, + this.alertFailedTaskThreshold = 25, + this.alertOfflineWorkerThreshold = 1, + this.alertCooldown = const Duration(minutes: 5), }) : hub = TurboStreamHub(); + static const _alertWebhookTimeout = Duration(seconds: 5); + /// Data source used to fetch queues and workers. final DashboardDataSource service; @@ -28,10 +38,34 @@ class DashboardState { /// Maximum number of events retained in memory. final int eventLimit; + /// Maximum number of audit entries retained in memory. + final int auditLimit; + + /// Webhook URLs used for alert delivery. + final List alertWebhookUrls; + + /// Backlog threshold triggering an alert. + final int alertBacklogThreshold; + + /// Failed-task threshold triggering an alert. + final int alertFailedTaskThreshold; + + /// Offline-worker threshold triggering an alert. + final int alertOfflineWorkerThreshold; + + /// Minimum duration between repeated alerts of the same type. + final Duration alertCooldown; + Timer? _timer; List _previousQueues = const []; Map _previousWorkers = const {}; + String _previousQueueSignature = ''; + String _previousWorkerSignature = ''; + String _previousTaskSignature = ''; + var _hasPrimedRefresh = false; final _events = []; + final _auditEntries = []; + final _lastAlertAt = {}; Future _polling = Future.value(); DateTime? _lastPollAt; DashboardThroughput _throughput = const DashboardThroughput( @@ -46,6 +80,10 @@ class DashboardState { /// Most recent throughput calculation. DashboardThroughput get throughput => _throughput; + /// Recent audit entries in reverse chronological order. + List get auditEntries => + List.unmodifiable(_auditEntries); + /// Starts the polling loop and emits initial state. Future start() async { await _runPoll(); @@ -67,14 +105,36 @@ class DashboardState { Future runOnce() => _poll(); Future _poll() async { - final queues = await service.fetchQueueSummaries(); - final workers = await service.fetchWorkerStatuses(); + final results = await Future.wait([ + service.fetchQueueSummaries(), + service.fetchWorkerStatuses(), + service.fetchTaskStatuses(limit: 120), + ]); + final queues = results[0] as List; + final workers = results[1] as List; + final tasks = results[2] as List; _updateThroughput(queues); _generateQueueEvents(_previousQueues, queues); _generateWorkerEvents(_previousWorkers, { for (final worker in workers) worker.workerId: worker, }); + await _evaluateAlerts(queues: queues, workers: workers, tasks: tasks); + + final queueSignature = _queueSignature(queues); + final workerSignature = _workerSignature(workers); + final taskSignature = _taskSignature(tasks); + final changed = + queueSignature != _previousQueueSignature || + workerSignature != _previousWorkerSignature || + taskSignature != _previousTaskSignature; + if (_hasPrimedRefresh && changed) { + _broadcastRefreshSignal(); + } + _hasPrimedRefresh = true; + _previousQueueSignature = queueSignature; + _previousWorkerSignature = workerSignature; + _previousTaskSignature = taskSignature; _previousQueues = queues; _previousWorkers = {for (final worker in workers) worker.workerId: worker}; @@ -256,4 +316,265 @@ class DashboardState { if (delta < 0) return 'decreased by ${delta.abs()}'; return 'unchanged'; } + + String _queueSignature(List queues) { + final sorted = List.from(queues) + ..sort((a, b) => a.queue.compareTo(b.queue)); + return sorted + .map( + (queue) { + return '${queue.queue}:${queue.pending}:' + '${queue.inflight}:${queue.deadLetters}'; + }, + ) + .join('|'); + } + + String _workerSignature(List workers) { + final sorted = List.from(workers) + ..sort((a, b) => a.workerId.compareTo(b.workerId)); + return sorted + .map( + (worker) { + final stamp = worker.timestamp.toUtc().toIso8601String(); + return '${worker.workerId}:${worker.inflight}:$stamp'; + }, + ) + .join('|'); + } + + String _taskSignature(List tasks) { + return tasks + .map( + (task) { + final stamp = task.updatedAt.toUtc().toIso8601String(); + return '${task.id}:${task.state.name}:${task.attempt}:$stamp'; + }, + ) + .join('|'); + } + + void _broadcastRefreshSignal() { + final payload = turboStreamReplace( + target: 'dashboard-refresh-signal', + html: '${stemNow().toUtc().toIso8601String()}', + ); + hub.broadcast('stem-dashboard:refresh', [payload]); + } + + /// Records an audit entry. + void recordAudit({ + required String kind, + required String action, + required String status, + String? actor, + String? summary, + Map metadata = const {}, + }) { + final entry = DashboardAuditEntry( + id: 'audit-${stemNow().toUtc().microsecondsSinceEpoch}', + timestamp: stemNow().toUtc(), + kind: kind, + action: action, + status: status, + actor: actor, + summary: summary, + metadata: metadata, + ); + _auditEntries.insert(0, entry); + if (_auditEntries.length > auditLimit) { + _auditEntries.removeRange(auditLimit, _auditEntries.length); + } + _broadcastRefreshSignal(); + } + + Future _evaluateAlerts({ + required List queues, + required List workers, + required List tasks, + }) async { + final totalPending = queues.fold( + 0, + (total, queue) => total + queue.pending, + ); + if (totalPending >= alertBacklogThreshold) { + await _emitAlert( + key: 'queue.backlog.high', + summary: + 'Backlog threshold exceeded: ' + '$totalPending >= $alertBacklogThreshold.', + metadata: { + 'pendingTotal': totalPending, + 'threshold': alertBacklogThreshold, + }, + ); + } + + final failedCount = tasks.where((task) { + return task.state == TaskState.failed || + task.state == TaskState.cancelled; + }).length; + if (failedCount >= alertFailedTaskThreshold) { + await _emitAlert( + key: 'tasks.failed.high', + summary: + 'Failed task threshold exceeded: ' + '$failedCount >= $alertFailedTaskThreshold.', + metadata: { + 'failedCount': failedCount, + 'threshold': alertFailedTaskThreshold, + }, + ); + } + + final offlineWorkers = workers.where( + (worker) => worker.age > const Duration(minutes: 2), + ); + if (offlineWorkers.length >= alertOfflineWorkerThreshold) { + await _emitAlert( + key: 'workers.offline.high', + summary: + 'Offline workers threshold exceeded: ${offlineWorkers.length} >= ' + '$alertOfflineWorkerThreshold.', + metadata: { + 'offlineWorkers': offlineWorkers + .map((worker) => worker.workerId) + .toList( + growable: false, + ), + 'threshold': alertOfflineWorkerThreshold, + }, + ); + } + } + + Future _emitAlert({ + required String key, + required String summary, + Map metadata = const {}, + }) async { + final now = stemNow().toUtc(); + final last = _lastAlertAt[key]; + if (last != null && now.difference(last) < alertCooldown) { + return; + } + _lastAlertAt[key] = now; + + recordAudit( + kind: 'alert', + action: key, + status: 'triggered', + actor: 'system', + summary: summary, + metadata: metadata, + ); + _recordEvent( + DashboardEvent( + title: 'Alert: $key', + timestamp: now, + summary: summary, + metadata: metadata, + ), + ); + + if (alertWebhookUrls.isEmpty) { + recordAudit( + kind: 'alert', + action: key, + status: 'skipped', + actor: 'system', + summary: 'No alert webhook URLs configured.', + ); + return; + } + await _sendAlertWebhooks(key: key, summary: summary, metadata: metadata); + } + + Future _sendAlertWebhooks({ + required String key, + required String summary, + required Map metadata, + }) async { + final payload = { + 'kind': 'stem-dashboard-alert', + 'key': key, + 'summary': summary, + 'timestamp': stemNow().toUtc().toIso8601String(), + 'metadata': metadata, + }; + + for (final rawUrl in alertWebhookUrls) { + final url = rawUrl.trim(); + if (url.isEmpty) continue; + final uri = Uri.tryParse(url); + if (uri == null || !uri.hasScheme || uri.host.isEmpty) { + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Invalid webhook URL: $url', + ); + continue; + } + + final client = HttpClient()..connectionTimeout = _alertWebhookTimeout; + HttpClientRequest? request; + var shouldAbortRequest = false; + try { + request = await client.postUrl(uri).timeout(_alertWebhookTimeout); + shouldAbortRequest = true; + request.headers.contentType = ContentType.json; + request.add(utf8.encode(jsonEncode(payload))); + final response = await request.close().timeout(_alertWebhookTimeout); + shouldAbortRequest = false; + try { + await response.drain().timeout(_alertWebhookTimeout); + } on Object { + // Response body is optional for webhook auditing. + } + if (response.statusCode >= 200 && response.statusCode < 300) { + recordAudit( + kind: 'alert', + action: key, + status: 'sent', + actor: 'system', + summary: 'Alert delivered to $url.', + ); + } else { + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Webhook returned HTTP ${response.statusCode} for $url.', + ); + } + } on TimeoutException { + if (shouldAbortRequest) { + request?.abort(); + } + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Webhook delivery timed out for $url.', + ); + } on Object catch (error) { + if (shouldAbortRequest) { + request?.abort(); + } + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Webhook delivery failed for $url: $error', + ); + } finally { + client.close(force: true); + } + } + } } diff --git a/packages/dashboard/lib/src/ui/audit.dart b/packages/dashboard/lib/src/ui/audit.dart new file mode 100644 index 00000000..daddb6f5 --- /dev/null +++ b/packages/dashboard/lib/src/ui/audit.dart @@ -0,0 +1,76 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildAuditContent(List entries) { + final actions = entries.where((entry) => entry.kind == 'action').length; + final alerts = entries.where((entry) => entry.kind == 'alert').length; + final failures = entries.where((entry) => entry.status == 'error').length; + + return ''' + + +
+ ${buildMetricCard('Entries', formatInt(entries.length), 'Recent audit records retained in dashboard memory.')} + ${buildMetricCard('Actions', formatInt(actions), 'Control/replay/revoke operations initiated by operators.')} + ${buildMetricCard('Alerts', formatInt(alerts), 'Automated threshold alerts emitted by polling logic.')} + ${buildMetricCard('Errors', formatInt(failures), 'Entries with status=error requiring follow-up.')} +
+ +
+
+

Timeline

+
+ + + + + + + + + + + + + + ${entries.isEmpty ? ''' + + + +''' : entries.take(250).map((entry) => ''' + + + + + + + + + +''').join()} + +
TimeKindActionStatusActorSummaryMetadata
No audit entries yet.
${formatRelative(entry.timestamp)}${escapeHtml(entry.kind)}${escapeHtml(entry.action)}${escapeHtml(entry.status)}${escapeHtml(entry.actor ?? 'system')}${escapeHtml(entry.summary ?? '—')}${escapeHtml(_formatMetadata(entry.metadata))}
+
+'''; +} + +String _formatMetadata(Map metadata) { + if (metadata.isEmpty) return '—'; + final values = metadata.entries + .map((entry) => '${entry.key}=${entry.value}') + .toList(growable: false); + return values.join(', '); +} diff --git a/packages/dashboard/lib/src/ui/content.dart b/packages/dashboard/lib/src/ui/content.dart index b5f3ba7e..b88cc745 100644 --- a/packages/dashboard/lib/src/ui/content.dart +++ b/packages/dashboard/lib/src/ui/content.dart @@ -1,594 +1,95 @@ -import 'package:intl/intl.dart'; -import 'package:stem/stem.dart' show stemNow; -// HTML template strings are kept on single lines for readability. -// ignore_for_file: lines_longer_than_80_chars - import 'package:stem_dashboard/src/services/models.dart'; -import 'package:stem_dashboard/src/ui/event_templates.dart'; +import 'package:stem_dashboard/src/ui/audit.dart'; +import 'package:stem_dashboard/src/ui/events.dart'; +import 'package:stem_dashboard/src/ui/failures.dart'; +import 'package:stem_dashboard/src/ui/jobs.dart'; import 'package:stem_dashboard/src/ui/layout.dart'; +import 'package:stem_dashboard/src/ui/namespaces.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/overview.dart'; +import 'package:stem_dashboard/src/ui/search.dart'; +import 'package:stem_dashboard/src/ui/task_detail.dart'; +import 'package:stem_dashboard/src/ui/tasks.dart'; +import 'package:stem_dashboard/src/ui/workers.dart'; +import 'package:stem_dashboard/src/ui/workflows.dart'; -final _numberFormat = NumberFormat.decimalPattern(); - -/// View options used by the tasks page renderer. -class TasksPageOptions { - /// Creates task page options with optional overrides. - const TasksPageOptions({ - this.sortKey = 'queue', - this.descending = false, - this.filter, - this.flashKey, - this.errorKey, - }); - - /// Sort key used for queue ordering. - final String sortKey; - - /// Whether sorting should be descending. - final bool descending; - - /// Optional queue filter text. - final String? filter; - - /// Optional flash message key for UI alerts. - final String? flashKey; - - /// Optional error message key for UI alerts. - final String? errorKey; - - /// Whether a non-empty filter value is set. - bool get hasFilter => filter != null && filter!.isNotEmpty; -} - -/// View options used by the workers page renderer. -class WorkersPageOptions { - /// Creates worker page options with optional overrides. - const WorkersPageOptions({this.flashMessage, this.errorMessage, this.scope}); - - /// Optional flash message for the UI. - final String? flashMessage; - - /// Optional error message for the UI. - final String? errorMessage; - - /// Optional worker scope filter. - final String? scope; - - /// Whether a non-empty flash message is set. - bool get hasFlash => flashMessage != null && flashMessage!.isNotEmpty; - - /// Whether a non-empty error message is set. - bool get hasError => errorMessage != null && errorMessage!.isNotEmpty; - - /// Whether a non-empty scope value is set. - bool get hasScope => scope != null && scope!.isNotEmpty; -} +export 'package:stem_dashboard/src/ui/options.dart'; /// Builds the HTML for the specified dashboard [page]. String buildPageContent({ required DashboardPage page, required List queues, required List workers, + List taskStatuses = const [], + DashboardTaskStatusEntry? taskDetail, + List runTimeline = const [], + DashboardWorkflowRunSnapshot? workflowRun, + List workflowSteps = const [], + List auditEntries = const [], DashboardThroughput? throughput, List events = const [], + String defaultNamespace = 'stem', TasksPageOptions tasksOptions = const TasksPageOptions(), WorkersPageOptions workersOptions = const WorkersPageOptions(), + FailuresPageOptions failuresOptions = const FailuresPageOptions(), + SearchPageOptions searchOptions = const SearchPageOptions(), + NamespacesPageOptions namespacesOptions = const NamespacesPageOptions(), + WorkflowsPageOptions workflowsOptions = const WorkflowsPageOptions(), + JobsPageOptions jobsOptions = const JobsPageOptions(), }) { switch (page) { case DashboardPage.overview: - return _overviewContent(queues, workers, throughput); + return buildOverviewContent( + queues, + workers, + throughput, + taskStatuses, + defaultNamespace, + ); case DashboardPage.tasks: - return _tasksContent(queues, tasksOptions); + return buildTasksContent(queues, tasksOptions, taskStatuses); + case DashboardPage.taskDetail: + return buildTaskDetailContent( + taskDetail, + runTimeline, + workflowRun, + workflowSteps, + ); + case DashboardPage.failures: + return buildFailuresContent(taskStatuses, failuresOptions); + case DashboardPage.search: + return buildSearchContent( + options: searchOptions, + queues: queues, + workers: workers, + taskStatuses: taskStatuses, + auditEntries: auditEntries, + ); + case DashboardPage.audit: + return buildAuditContent(auditEntries); case DashboardPage.events: - return _eventsContent(events); + return buildEventsContent(events); case DashboardPage.workers: - return _workersContent(workers, queues, workersOptions); - } -} - -String _overviewContent( - List queues, - List workers, - DashboardThroughput? throughput, -) { - final totalPending = queues.fold( - 0, - (total, summary) => total + summary.pending, - ); - final totalInflight = queues.fold( - 0, - (total, summary) => total + summary.inflight, - ); - final totalDead = queues.fold( - 0, - (total, summary) => total + summary.deadLetters, - ); - final activeWorkers = workers.length; - final busiest = List.of( - queues, - )..sort((a, b) => (b.pending + b.inflight).compareTo(a.pending + a.inflight)); - final topQueues = busiest.take(5).toList(); - - final processedPerMin = throughput?.processedPerMinute ?? 0; - final enqueuedPerMin = throughput?.enqueuedPerMinute ?? 0; - final throughputHint = throughput == null - ? 'Waiting for another snapshot to estimate rate.' - : 'Net change over the last ${throughput.interval.inSeconds}s.'; - - return ''' - - -
- ${_metricCard('Backlog (lag)', _formatInt(totalPending), 'Undelivered tasks waiting across all queues.')} - ${_metricCard('Processing', _formatInt(totalInflight), 'Active envelopes currently being executed.')} - ${_metricCard('Processed / min', _formatRate(processedPerMin), throughputHint)} - ${_metricCard('Enqueued / min', _formatRate(enqueuedPerMin), throughputHint)} - ${_metricCard('Dead letters', _formatInt(totalDead), 'Items held in dead letter queues.')} - ${_metricCard('Active workers', _formatInt(activeWorkers), 'Workers that published heartbeats within the retention window.')} -
- -
- - - - - - - - - - - ${topQueues.isEmpty ? _emptyQueuesRow('No queues detected yet.') : topQueues.map(_queueTableRow).join()} - -
QueuePendingIn-flightDead letters
-
-'''; -} - -String _tasksContent(List queues, TasksPageOptions options) { - var filtered = - options.hasFilter - ? queues - .where( - (summary) => summary.queue.toLowerCase().contains( - options.filter!.toLowerCase(), - ), - ) - .toList() - : List.of(queues) - ..sort((a, b) => _compareQueues(a, b, options)); - if (options.descending) { - filtered = filtered.reversed.toList(); - } - - final totalQueues = filtered.length; - final dlqTotal = filtered.fold( - 0, - (total, summary) => total + summary.deadLetters, - ); - - return ''' - - -${_renderTasksAlert(options)} - -
- ${_metricCard('Tracked queues', _formatInt(totalQueues), 'Queues discovered via Redis stream prefixes.')} - ${_metricCard('Dead letter size', _formatInt(dlqTotal), 'Aggregate items across all dead letter queues.')} -
- -
- - - - - - ${options.hasFilter ? 'Clear' : ''} -
- -
- - - - - - - - - - - ${filtered.isEmpty ? _emptyQueuesRow('No streams found for the configured namespace.') : filtered.map(_queueTableRow).join()} - -
${_sortableHeader('Queue', 'queue', options)}${_sortableHeader('Pending', 'pending', options)}${_sortableHeader('In-flight', 'inflight', options)}${_sortableHeader('Dead letters', 'dead', options)}
-
- -
-
-

Ad-hoc enqueue

-
-
- - - - - -
- -
-
-
-'''; -} - -String _formatRate(double value) { - if (value <= 0) return '0'; - if (value < 1) return value.toStringAsFixed(2); - return _numberFormat.format(value.round()); -} - -String _eventsContent(List events) { - final items = events.isEmpty - ? ''' -
-

No events captured yet

-

- Configure the dashboard event bridge to stream Stem signals (enqueue, start, retry, completion) into Redis. - Once connected, updates will appear here automatically via Turbo Streams. -

-
- ''' - : events.map(renderEventItem).join(); - - return ''' - - -
- $items -
-'''; -} - -String _workersContent( - List workers, - List queues, - WorkersPageOptions options, -) { - final healthyWorkers = workers.where((worker) { - return worker.age <= const Duration(minutes: 2); - }).length; - - final busy = workers.where((worker) => worker.inflight > 0).length; - final queueMap = {for (final summary in queues) summary.queue: summary}; - - return ''' - - -${_renderWorkersAlert(options)} - -
- ${_metricCard('Healthy workers', _formatInt(healthyWorkers), 'Heartbeats received within the last two minutes.')} - ${_metricCard('Busy workers', _formatInt(busy), 'Workers currently processing at least one task.')} - ${_metricCard('Isolates in use', _formatInt(_totalIsolates(workers)), 'Sum of worker isolates across the cluster.')} -
- -
- - - - - - - - - - - - ${workers.isEmpty ? ''' - - - - ''' : workers.map(_workerRow).join()} - -
WorkerQueuesInflightLast heartbeatActions
No heartbeats detected for namespace "${workers.isEmpty ? 'stem' : workers.first.namespace}".
-
- -${_clusterControls()} - -${_queueRecoverySection(queueMap)} -'''; -} - -String _queueTableRow(QueueSummary summary) { - return ''' - - ${summary.queue} - ${_formatInt(summary.pending)} - ${_formatInt(summary.inflight)} - ${_formatInt(summary.deadLetters)} - - - -
-
Pending ${_formatInt(summary.pending)}
-
In-flight ${_formatInt(summary.inflight)}
-
Dead letters ${_formatInt(summary.deadLetters)}
-
Detailed DLQ previews render here once the replay control is wired.
-
- - -'''; -} - -String _workerRow(WorkerStatus status) { - final queues = status.queues.isEmpty - ? '—' - : status.queues - .map((queue) => '${queue.name}') - .join(' '); - return ''' - - ${status.workerId} - $queues - ${_formatInt(status.inflight)} - ${_formatRelative(status.timestamp)} - -
- ${_workerActionButton('Ping', 'ping', status.workerId)} - ${_workerActionButton('Pause', 'pause', status.workerId)} - ${_workerActionButton('Shutdown', 'shutdown', status.workerId)} -
- - -'''; -} - -String _workerActionButton(String label, String action, String workerId) { - return ''' -
- - - -
-'''; -} - -String _clusterControls() { - return ''' -
-

Cluster controls

-
- ${_clusterActionButton('Ping all workers', 'ping')} - ${_clusterActionButton('Pause all workers', 'pause')} - ${_clusterActionButton('Shutdown all workers', 'shutdown')} -
-
-'''; -} - -String _clusterActionButton(String label, String action) { - return ''' -
- - - -
-'''; -} - -String _queueRecoverySection(Map queues) { - if (queues.isEmpty) return ''; - final rows = queues.values.toList() - ..sort((a, b) => a.queue.compareTo(b.queue)); - return ''' -
- - - - - - - - - - - ${rows.map(_queueRecoveryRow).join()} - -
QueuePendingDead lettersReplay
-
-'''; -} - -String _queueRecoveryRow(QueueSummary summary) { - final limitDefault = summary.deadLetters <= 0 - ? 50 - : summary.deadLetters.clamp(1, 50); - final action = summary.deadLetters == 0 - ? 'No dead letters' - : ''' -
- - - -
- '''; - return ''' - - ${_escapeHtml(summary.queue)} - ${_formatInt(summary.pending)} - ${_formatInt(summary.deadLetters)} - $action - -'''; -} - -String _metricCard(String title, String value, String caption) { - return ''' -
-
$title
-
$value
-

$caption

-
-'''; -} - -String _emptyQueuesRow(String message) { - return ''' - - $message - -'''; -} - -String _renderTasksAlert(TasksPageOptions options) { - String? message; - var type = 'success'; - switch (options.flashKey) { - case 'queued': - message = 'Task enqueued successfully.'; - } - switch (options.errorKey) { - case 'missing-fields': - message = 'Queue and task name are required.'; - type = 'error'; - case 'invalid-payload': - message = 'Payload must be valid JSON describing an object.'; - type = 'error'; - case 'enqueue-failed': - message = - 'Failed to enqueue the task. Check the dashboard logs for details.'; - type = 'error'; - } - - if (message == null) return ''; - return '
${_escapeHtml(message)}
'; -} - -String _renderWorkersAlert(WorkersPageOptions options) { - if (options.hasError) { - final scope = options.hasScope - ? '
Target: ${_escapeHtml(options.scope!)}.
' - : ''; - return ''' -
- ${_escapeHtml(options.errorMessage!)} - $scope -
-'''; - } - if (options.hasFlash) { - final scope = options.hasScope - ? '
Target: ${_escapeHtml(options.scope!)}.
' - : ''; - return ''' -
- ${_escapeHtml(options.flashMessage!)} - $scope -
-'''; - } - return ''; -} - -int _compareQueues(QueueSummary a, QueueSummary b, TasksPageOptions options) { - switch (options.sortKey) { - case 'pending': - return a.pending.compareTo(b.pending); - case 'inflight': - return a.inflight.compareTo(b.inflight); - case 'dead': - return a.deadLetters.compareTo(b.deadLetters); - case 'queue': - default: - return a.queue.toLowerCase().compareTo(b.queue.toLowerCase()); + return buildWorkersContent(workers, queues, workersOptions); + case DashboardPage.namespaces: + return buildNamespacesContent( + queues: queues, + workers: workers, + taskStatuses: taskStatuses, + options: namespacesOptions, + defaultNamespace: defaultNamespace, + ); + case DashboardPage.workflows: + return buildWorkflowsContent( + taskStatuses: taskStatuses, + options: workflowsOptions, + ); + case DashboardPage.jobs: + return buildJobsContent(taskStatuses: taskStatuses, options: jobsOptions); } } -String _sortableHeader(String label, String key, TasksPageOptions options) { - final isActive = options.sortKey == key; - final descendingNext = isActive ? !options.descending : key != 'queue'; - final params = { - 'sort': key, - 'direction': descendingNext ? 'desc' : 'asc', - }; - if (options.hasFilter) { - params['queue'] = options.filter!; - } - final query = _buildQuery(params); - final indicator = isActive ? (options.descending ? '↓' : '↑') : ''; - final classes = isActive ? 'sort-link active' : 'sort-link'; - return '$label $indicator'; -} - -String _buildQuery(Map params) { - return params.entries - .map( - (entry) => - '${Uri.encodeQueryComponent(entry.key)}=${Uri.encodeQueryComponent(entry.value)}', - ) - .join('&'); -} - -String _escapeHtml(String value) { - return value - .replaceAll('&', '&') - .replaceAll('<', '<') - .replaceAll('>', '>') - .replaceAll('"', '"') - .replaceAll("'", '''); -} - -int _totalIsolates(List workers) { - return workers.fold(0, (total, status) => total + status.isolateCount); -} - -String _formatInt(int value) => _numberFormat.format(value); - -String _formatRelative(DateTime timestamp) { - final now = stemNow().toUtc(); - final diff = now.difference(timestamp.toUtc()); - if (diff < const Duration(seconds: 30)) return 'just now'; - if (diff < const Duration(minutes: 1)) { - return '${diff.inSeconds}s ago'; - } - if (diff < const Duration(hours: 1)) { - return '${diff.inMinutes}m ago'; - } - if (diff < const Duration(days: 1)) { - return '${diff.inHours}h ago'; - } - return '${diff.inDays}d ago'; +/// Builds inline expandable-row content for task table details. +String buildTaskInlineContent(DashboardTaskStatusEntry? task) { + return buildTaskInlinePanel(task); } diff --git a/packages/dashboard/lib/src/ui/event_templates.dart b/packages/dashboard/lib/src/ui/event_templates.dart index 013454e7..b796f58c 100644 --- a/packages/dashboard/lib/src/ui/event_templates.dart +++ b/packages/dashboard/lib/src/ui/event_templates.dart @@ -1,23 +1,25 @@ import 'package:intl/intl.dart'; import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart' show escapeHtml; final _eventTimeFormat = DateFormat('HH:mm:ss'); /// Renders a dashboard event as an HTML list item. String renderEventItem(DashboardEvent event) { final timestamp = _eventTimeFormat.format(event.timestamp.toLocal()); - final metadataItems = event.metadata.entries - .map((entry) => '${entry.key}: ${entry.value}') - .join(); + final metadataItems = event.metadata.entries.map((entry) { + final value = entry.value == null ? 'null' : entry.value.toString(); + return '${escapeHtml(entry.key)}: ${escapeHtml(value)}'; + }).join(); final summary = event.summary != null && event.summary!.isNotEmpty - ? '

${event.summary}

' + ? '

${escapeHtml(event.summary!)}

' : ''; return ''' -
+
- ${event.title} + ${escapeHtml(event.title)} $timestamp $summary diff --git a/packages/dashboard/lib/src/ui/events.dart b/packages/dashboard/lib/src/ui/events.dart new file mode 100644 index 00000000..6cbefdca --- /dev/null +++ b/packages/dashboard/lib/src/ui/events.dart @@ -0,0 +1,33 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/event_templates.dart'; + +String buildEventsContent(List events) { + final items = events.isEmpty + ? ''' +
+

No events captured yet

+

+ Configure the dashboard event bridge to stream Stem signals (enqueue, start, retry, completion) into Redis. + Once connected, updates will appear here automatically via Turbo Streams. +

+
+ ''' + : events.map(renderEventItem).join(); + + return ''' + + +
+ $items +
+'''; +} diff --git a/packages/dashboard/lib/src/ui/failures.dart b/packages/dashboard/lib/src/ui/failures.dart new file mode 100644 index 00000000..fc282f93 --- /dev/null +++ b/packages/dashboard/lib/src/ui/failures.dart @@ -0,0 +1,180 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem/stem.dart' show TaskState; +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildFailuresContent( + List taskStatuses, + FailuresPageOptions options, +) { + final failures = taskStatuses + .where((task) => task.state == TaskState.failed) + .toList(growable: false); + final filtered = options.hasQueueFilter + ? failures + .where( + (task) => + task.queue.toLowerCase() == options.queue!.toLowerCase(), + ) + .toList(growable: false) + : failures; + + final groups = _groupFailures(filtered) + ..sort((a, b) { + final byCount = b.count.compareTo(a.count); + if (byCount != 0) return byCount; + return b.lastUpdated.compareTo(a.lastUpdated); + }); + + final affectedQueues = filtered.map((task) => task.queue).toSet().length; + final redirectPath = options.hasQueueFilter + ? '/failures?queue=${Uri.encodeQueryComponent(options.queue!)}' + : '/failures'; + + return ''' + + +${renderFailuresAlert(options)} + +
+ ${buildMetricCard('Failed statuses', formatInt(filtered.length), 'Terminal failed task statuses captured by the result backend.')} + ${buildMetricCard('Failure groups', formatInt(groups.length), 'Unique queue + task + error fingerprints.')} + ${buildMetricCard('Affected queues', formatInt(affectedQueues), 'Queues currently carrying failed statuses.')} +
+ +
+ + + + ${options.hasQueueFilter ? 'Clear' : ''} +
+ +
+
+

Failure Groups

+
+ + + + + + + + + + + + + ${groups.isEmpty ? ''' + + + +''' : groups.map((group) => ''' + + + + + + + + +''').join()} + +
QueueTaskFingerprintCountLatestRetry
No failed statuses match the current filter.
${escapeHtml(group.queue)}${escapeHtml(group.taskName)}${escapeHtml(group.errorFingerprint)}${formatInt(group.count)}${formatRelative(group.lastUpdated)} +
+ + + + +
+
+
+ +
+
+

Recent Failed Tasks

+
+ ${buildTaskStatusTable( + filtered.take(40).toList(growable: false), + options: DashboardTaskTableOptions( + showState: false, + emptyMessage: 'No individual failures to inspect.', + actionsBuilder: (task) => + buildTaskReplayAction(task, redirectPath: redirectPath), + ), + )} +
+'''; +} + +String renderFailuresAlert(FailuresPageOptions options) { + if (options.hasError) { + return '
${escapeHtml(options.errorMessage!)}
'; + } + if (options.hasFlash) { + return '
${escapeHtml(options.flashMessage!)}
'; + } + return ''; +} + +List<_FailureGroup> _groupFailures(List statuses) { + final groups = {}; + for (final task in statuses) { + final key = '${task.queue}|${task.taskName}|${task.errorFingerprint}'; + final existing = groups[key]; + if (existing == null) { + groups[key] = _FailureGroup( + queue: task.queue, + taskName: task.taskName, + errorFingerprint: task.errorFingerprint, + count: 1, + lastUpdated: task.updatedAt, + ); + continue; + } + groups[key] = existing.copyWith( + count: existing.count + 1, + lastUpdated: task.updatedAt.isAfter(existing.lastUpdated) + ? task.updatedAt + : existing.lastUpdated, + ); + } + return groups.values.toList(growable: false); +} + +class _FailureGroup { + const _FailureGroup({ + required this.queue, + required this.taskName, + required this.errorFingerprint, + required this.count, + required this.lastUpdated, + }); + + final String queue; + final String taskName; + final String errorFingerprint; + final int count; + final DateTime lastUpdated; + + int get replayLimit => count.clamp(1, 100); + + _FailureGroup copyWith({int? count, DateTime? lastUpdated}) { + return _FailureGroup( + queue: queue, + taskName: taskName, + errorFingerprint: errorFingerprint, + count: count ?? this.count, + lastUpdated: lastUpdated ?? this.lastUpdated, + ); + } +} diff --git a/packages/dashboard/lib/src/ui/jobs.dart b/packages/dashboard/lib/src/ui/jobs.dart new file mode 100644 index 00000000..164e0704 --- /dev/null +++ b/packages/dashboard/lib/src/ui/jobs.dart @@ -0,0 +1,106 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildJobsContent({ + required List taskStatuses, + required JobsPageOptions options, +}) { + final jobs = buildJobSummaries(taskStatuses, limit: 500); + final taskFilter = options.task?.toLowerCase(); + final queueFilter = options.queue?.toLowerCase(); + final filtered = jobs.where((entry) { + final matchesTask = + taskFilter == null || + taskFilter.isEmpty || + entry.taskName.toLowerCase().contains(taskFilter); + final matchesQueue = + queueFilter == null || + queueFilter.isEmpty || + entry.sampleQueue.toLowerCase().contains(queueFilter); + return matchesTask && matchesQueue; + }).toList(growable: false); + + final total = filtered.fold(0, (sum, entry) => sum + entry.total); + final running = filtered.fold(0, (sum, entry) => sum + entry.running); + final failures = filtered.fold(0, (sum, entry) => sum + entry.failed); + + return ''' + + +
+ ${buildMetricCard('Task families', formatInt(filtered.length), 'Distinct task names represented in sampled statuses.')} + ${buildMetricCard('Sampled statuses', formatInt(total), 'Total status records currently included in this page sample.')} + ${buildMetricCard('Running', formatInt(running), 'Running statuses across filtered task families.')} + ${buildMetricCard('Failures', formatInt(failures), 'Failed statuses across filtered task families.')} +
+ +
+ + + + + + ${(options.hasTask || options.hasQueue) ? 'Clear' : ''} +
+ +
+
+

Job Summary

+
+ + + + + + + + + + + + + + + + + + ${filtered.isEmpty ? ''' + + + +''' : filtered.map((entry) => ''' + + + + + + + + + + + + + +''').join()} + +
TaskQueueSampledRunningSucceededFailedRetriedCancelledFailure ratioUpdatedActions
No jobs match the current filters.
${escapeHtml(entry.taskName)}${escapeHtml(entry.sampleQueue)}${formatInt(entry.total)}${formatInt(entry.running)}${formatInt(entry.succeeded)}${formatInt(entry.failed)}${formatInt(entry.retried)}${formatInt(entry.cancelled)}${(entry.failureRatio * 100).toStringAsFixed(1)}%${formatRelative(entry.lastUpdated)} + +
+
+'''; +} diff --git a/packages/dashboard/lib/src/ui/layout.dart b/packages/dashboard/lib/src/ui/layout.dart index 9b051deb..6f34c387 100644 --- a/packages/dashboard/lib/src/ui/layout.dart +++ b/packages/dashboard/lib/src/ui/layout.dart @@ -1,3 +1,5 @@ +import 'package:stem_dashboard/src/ui/paths.dart'; + /// Pages supported by the dashboard UI. enum DashboardPage { /// Overview landing page. @@ -6,9 +8,30 @@ enum DashboardPage { /// Task and queue details page. tasks('/tasks'), + /// Detailed view for a single task / workflow run. + taskDetail('/tasks/detail'), + + /// Failure diagnostics and grouped retry controls. + failures('/failures'), + + /// Global search and saved operational views. + search('/search'), + + /// Audit log for actions and alert deliveries. + audit('/audit'), + /// Event feed page. events('/events'), + /// Namespace-centric operational summary. + namespaces('/namespaces'), + + /// Workflow run-centric operational summary. + workflows('/workflows'), + + /// Task family/job-centric operational summary. + jobs('/jobs'), + /// Worker status page. workers('/workers'); @@ -25,26 +48,76 @@ enum DashboardPage { return 'Overview'; case DashboardPage.tasks: return 'Tasks'; + case DashboardPage.taskDetail: + return 'Task Detail'; + case DashboardPage.failures: + return 'Failures'; + case DashboardPage.search: + return 'Search'; + case DashboardPage.audit: + return 'Audit'; case DashboardPage.events: return 'Events'; + case DashboardPage.namespaces: + return 'Namespaces'; + case DashboardPage.workflows: + return 'Workflows'; + case DashboardPage.jobs: + return 'Jobs'; case DashboardPage.workers: return 'Workers'; } } + /// Whether this page should appear in sidebar navigation. + bool get showInNav => this != DashboardPage.taskDetail; + /// Browser title for this page. String get title => 'Stem Dashboard · $label'; } /// Renders the full HTML layout for a dashboard page. -String renderLayout(DashboardPage page, String content) { +String renderLayout( + DashboardPage page, + String content, { + String basePath = '', + String? streamPath, +}) { + final resolvedBasePath = normalizeDashboardBasePath(basePath); + final resolvedStreamPath = + streamPath ?? dashboardRoute(basePath, '/dash/streams'); return ''' - + ${page.title} + + + + + +
-
@@ -661,16 +2055,20 @@ $content '''; } -String _renderNav(DashboardPage active) { - return DashboardPage.values.map((page) => _navLink(page, active)).join('\n'); +String _renderNav(DashboardPage active, String basePath) { + return DashboardPage.values + .where((page) => page.showInNav) + .map((page) => _navLink(page, active, basePath)) + .join('\n'); } -String _navLink(DashboardPage page, DashboardPage active) { +String _navLink(DashboardPage page, DashboardPage active, String basePath) { final isActive = page == active; final classes = ['nav-link', if (isActive) 'active'].join(' '); final aria = isActive ? ' aria-current="page"' : ''; + final route = dashboardRoute(basePath, page.path); return ''' - + ${page.label} '''; diff --git a/packages/dashboard/lib/src/ui/namespaces.dart b/packages/dashboard/lib/src/ui/namespaces.dart new file mode 100644 index 00000000..49456f6f --- /dev/null +++ b/packages/dashboard/lib/src/ui/namespaces.dart @@ -0,0 +1,115 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildNamespacesContent({ + required List queues, + required List workers, + required List taskStatuses, + required NamespacesPageOptions options, + required String defaultNamespace, +}) { + final snapshots = buildNamespaceSnapshots( + queues: queues, + workers: workers, + tasks: taskStatuses, + defaultNamespace: defaultNamespace, + ); + final namespaceFilter = options.namespace?.toLowerCase(); + final filtered = options.hasNamespace + ? snapshots + .where( + (entry) => + entry.namespace.toLowerCase().contains(namespaceFilter!), + ) + .toList(growable: false) + : snapshots; + + final totalPending = filtered.fold( + 0, + (sum, entry) => sum + entry.pending, + ); + final totalInflight = filtered.fold( + 0, + (sum, entry) => sum + entry.inflight, + ); + final totalFailed = filtered.fold( + 0, + (sum, entry) => sum + entry.failedTasks, + ); + + return ''' + + +
+ ${buildMetricCard('Namespaces', formatInt(filtered.length), 'Distinct namespaces detected from queue, worker, and task metadata.')} + ${buildMetricCard('Backlog', formatInt(totalPending), 'Pending envelopes across selected namespaces.')} + ${buildMetricCard('In-flight', formatInt(totalInflight), 'Current processing load across selected namespaces.')} + ${buildMetricCard('Failed tasks', formatInt(totalFailed), 'Recent terminal failures observed for selected namespaces.')} +
+ +
+ + + + ${options.hasNamespace ? 'Clear' : ''} +
+ +
+
+

Namespace Summary

+
+ + + + + + + + + + + + + + + + + ${filtered.isEmpty ? ''' + + + +''' : filtered.map((entry) => ''' + + + + + + + + + + + + +''').join()} + +
NamespaceQueuesWorkersBacklogIn-flightDead lettersRunningFailedWorkflow runsActions
No namespace data matches the current filter.
${escapeHtml(entry.namespace)}${formatInt(entry.queueCount)}${formatInt(entry.workerCount)}${formatInt(entry.pending)}${formatInt(entry.inflight)}${formatInt(entry.deadLetters)}${formatInt(entry.runningTasks)}${formatInt(entry.failedTasks)}${formatInt(entry.workflowRuns)} +
+ Tasks + Workers + Search +
+
+
+'''; +} diff --git a/packages/dashboard/lib/src/ui/options.dart b/packages/dashboard/lib/src/ui/options.dart new file mode 100644 index 00000000..78a90bcf --- /dev/null +++ b/packages/dashboard/lib/src/ui/options.dart @@ -0,0 +1,238 @@ +import 'package:stem/stem.dart' show TaskState; + +/// View options used by the tasks page renderer. +class TasksPageOptions { + /// Creates task page options with optional overrides. + const TasksPageOptions({ + this.sortKey = 'queue', + this.descending = false, + this.filter, + this.namespaceFilter, + this.taskFilter, + this.runId, + this.stateFilter, + this.flashKey, + this.errorKey, + this.page = 1, + this.pageSize = 25, + this.hasNextPage = false, + this.hasPreviousPage = false, + }); + + /// Sort key used for queue ordering. + final String sortKey; + + /// Whether sorting should be descending. + final bool descending; + + /// Optional queue filter text. + final String? filter; + + /// Optional namespace filter text. + final String? namespaceFilter; + + /// Optional task name filter text. + final String? taskFilter; + + /// Optional workflow run id filter. + final String? runId; + + /// Optional lifecycle state filter for recent task statuses. + final TaskState? stateFilter; + + /// Optional flash message key for UI alerts. + final String? flashKey; + + /// Optional error message key for UI alerts. + final String? errorKey; + + /// Current page number (1-based). + final int page; + + /// Number of task status records requested for a page. + final int pageSize; + + /// Whether another page exists after [page]. + final bool hasNextPage; + + /// Whether a page exists before [page]. + final bool hasPreviousPage; + + /// Whether a non-empty filter value is set. + bool get hasFilter => filter != null && filter!.isNotEmpty; + + /// Whether a non-empty namespace filter value is set. + bool get hasNamespaceFilter => + namespaceFilter != null && namespaceFilter!.isNotEmpty; + + /// Whether a non-empty task filter value is set. + bool get hasTaskFilter => taskFilter != null && taskFilter!.isNotEmpty; + + /// Whether a non-empty run id filter value is set. + bool get hasRunIdFilter => runId != null && runId!.isNotEmpty; + + /// Whether a status filter is set. + bool get hasStateFilter => stateFilter != null; + + /// Zero-based offset derived from [page] and [pageSize]. + int get offset => (page - 1) * pageSize; + + /// Whether pagination controls should be shown. + bool get hasPagination => hasPreviousPage || hasNextPage || page > 1; + + /// Creates a copy with selected fields replaced. + TasksPageOptions copyWith({ + String? sortKey, + bool? descending, + String? filter, + String? namespaceFilter, + String? taskFilter, + String? runId, + TaskState? stateFilter, + String? flashKey, + String? errorKey, + int? page, + int? pageSize, + bool? hasNextPage, + bool? hasPreviousPage, + }) { + return TasksPageOptions( + sortKey: sortKey ?? this.sortKey, + descending: descending ?? this.descending, + filter: filter ?? this.filter, + namespaceFilter: namespaceFilter ?? this.namespaceFilter, + taskFilter: taskFilter ?? this.taskFilter, + runId: runId ?? this.runId, + stateFilter: stateFilter ?? this.stateFilter, + flashKey: flashKey ?? this.flashKey, + errorKey: errorKey ?? this.errorKey, + page: page ?? this.page, + pageSize: pageSize ?? this.pageSize, + hasNextPage: hasNextPage ?? this.hasNextPage, + hasPreviousPage: hasPreviousPage ?? this.hasPreviousPage, + ); + } +} + +/// View options used by the workers page renderer. +class WorkersPageOptions { + /// Creates worker page options with optional overrides. + const WorkersPageOptions({ + this.flashMessage, + this.errorMessage, + this.scope, + this.namespaceFilter, + }); + + /// Optional flash message for the UI. + final String? flashMessage; + + /// Optional error message for the UI. + final String? errorMessage; + + /// Optional worker scope filter. + final String? scope; + + /// Optional namespace filter. + final String? namespaceFilter; + + /// Whether a non-empty flash message is set. + bool get hasFlash => flashMessage != null && flashMessage!.isNotEmpty; + + /// Whether a non-empty error message is set. + bool get hasError => errorMessage != null && errorMessage!.isNotEmpty; + + /// Whether a non-empty scope value is set. + bool get hasScope => scope != null && scope!.isNotEmpty; + + /// Whether a non-empty namespace filter value is set. + bool get hasNamespaceFilter => + namespaceFilter != null && namespaceFilter!.isNotEmpty; +} + +/// View options used by the failures page renderer. +class FailuresPageOptions { + /// Creates failure diagnostics options with optional overrides. + const FailuresPageOptions({this.queue, this.flashMessage, this.errorMessage}); + + /// Optional queue filter. + final String? queue; + + /// Optional flash message for the UI. + final String? flashMessage; + + /// Optional error message for the UI. + final String? errorMessage; + + /// Whether a non-empty queue filter is set. + bool get hasQueueFilter => queue != null && queue!.isNotEmpty; + + /// Whether a non-empty flash message is set. + bool get hasFlash => flashMessage != null && flashMessage!.isNotEmpty; + + /// Whether a non-empty error message is set. + bool get hasError => errorMessage != null && errorMessage!.isNotEmpty; +} + +/// View options used by the search page renderer. +class SearchPageOptions { + /// Creates search options with optional overrides. + const SearchPageOptions({this.query, this.scope = 'all'}); + + /// Free-text search query. + final String? query; + + /// Scope filter (`all`, `tasks`, `workers`, `queues`, `audit`). + final String scope; + + /// Whether a query is present. + bool get hasQuery => query != null && query!.trim().isNotEmpty; +} + +/// View options used by the namespaces page renderer. +class NamespacesPageOptions { + /// Creates namespace options with optional overrides. + const NamespacesPageOptions({this.namespace}); + + /// Optional namespace filter text. + final String? namespace; + + /// Whether namespace filter is set. + bool get hasNamespace => namespace != null && namespace!.isNotEmpty; +} + +/// View options used by the workflows page renderer. +class WorkflowsPageOptions { + /// Creates workflows options with optional overrides. + const WorkflowsPageOptions({this.workflow, this.runId}); + + /// Optional workflow name filter text. + final String? workflow; + + /// Optional run-id filter text. + final String? runId; + + /// Whether workflow filter is set. + bool get hasWorkflow => workflow != null && workflow!.isNotEmpty; + + /// Whether run-id filter is set. + bool get hasRunId => runId != null && runId!.isNotEmpty; +} + +/// View options used by the jobs page renderer. +class JobsPageOptions { + /// Creates jobs options with optional overrides. + const JobsPageOptions({this.task, this.queue}); + + /// Optional task-name filter. + final String? task; + + /// Optional queue filter. + final String? queue; + + /// Whether task filter is set. + bool get hasTask => task != null && task!.isNotEmpty; + + /// Whether queue filter is set. + bool get hasQueue => queue != null && queue!.isNotEmpty; +} diff --git a/packages/dashboard/lib/src/ui/overview.dart b/packages/dashboard/lib/src/ui/overview.dart new file mode 100644 index 00000000..5aa140ff --- /dev/null +++ b/packages/dashboard/lib/src/ui/overview.dart @@ -0,0 +1,449 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem/stem.dart' show TaskState, stemNow; +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +class OverviewSections { + const OverviewSections({ + required this.metrics, + required this.namespaces, + required this.topQueues, + required this.workflows, + required this.jobs, + required this.latency, + required this.recentTasks, + }); + + final String metrics; + final String namespaces; + final String topQueues; + final String workflows; + final String jobs; + final String latency; + final String recentTasks; +} + +String buildOverviewContent( + List queues, + List workers, + DashboardThroughput? throughput, + List taskStatuses, + String defaultNamespace, +) { + final sections = buildOverviewSections( + queues, + workers, + throughput, + taskStatuses, + defaultNamespace: defaultNamespace, + ); + return ''' + + +${sections.metrics} + +${sections.namespaces} + +${sections.topQueues} + +${sections.workflows} + +${sections.jobs} + +${sections.latency} + +${sections.recentTasks} +'''; +} + +OverviewSections buildOverviewSections( + List queues, + List workers, + DashboardThroughput? throughput, + List taskStatuses, + { + String defaultNamespace = 'stem', + } +) { + final totalPending = queues.fold( + 0, + (total, summary) => total + summary.pending, + ); + final totalInflight = queues.fold( + 0, + (total, summary) => total + summary.inflight, + ); + final totalDead = queues.fold( + 0, + (total, summary) => total + summary.deadLetters, + ); + final activeWorkers = workers.length; + final busiest = List.of( + queues, + )..sort((a, b) => (b.pending + b.inflight).compareTo(a.pending + a.inflight)); + final topQueues = busiest.take(5).toList(); + + final processedPerMin = throughput?.processedPerMinute ?? 0; + final enqueuedPerMin = throughput?.enqueuedPerMinute ?? 0; + final failedTasks = taskStatuses.where( + (task) => task.state == TaskState.failed, + ); + final workflowTasks = taskStatuses.where((task) => task.isWorkflowTask); + final runningTasks = taskStatuses.where( + (task) => task.state == TaskState.running, + ); + final now = stemNow().toUtc(); + const queuedStuckThreshold = Duration(minutes: 5); + const runningStuckThreshold = Duration(minutes: 15); + final stuckQueued = taskStatuses.where((task) { + if (task.state != TaskState.queued) return false; + return now.difference(task.createdAt.toUtc()) > queuedStuckThreshold; + }).length; + final stuckRunning = taskStatuses.where((task) { + if (task.state != TaskState.running) return false; + final anchor = task.startedAt ?? task.updatedAt.toUtc(); + return now.difference(anchor) > runningStuckThreshold; + }).length; + final queueLatency = _buildQueueLatency(taskStatuses); + final namespaces = buildNamespaceSnapshots( + queues: queues, + workers: workers, + tasks: taskStatuses, + defaultNamespace: defaultNamespace, + ); + final workflowRuns = buildWorkflowRunSummaries(taskStatuses, limit: 8); + final jobs = buildJobSummaries(taskStatuses, limit: 8); + final slaBreaches = queueLatency.fold( + 0, + (total, row) => total + row.slaBreaches, + ); + final throughputHint = throughput == null + ? 'Waiting for another snapshot to estimate rate.' + : 'Net change over the last ${throughput.interval.inSeconds}s.'; + + final metrics = + ''' +
+ ${buildMetricCard('Backlog (lag)', formatInt(totalPending), 'Undelivered tasks waiting across all queues.')} + ${buildMetricCard('Processing', formatInt(totalInflight), 'Active envelopes currently being executed.')} + ${buildMetricCard('Processed / min', formatRate(processedPerMin), throughputHint)} + ${buildMetricCard('Enqueued / min', formatRate(enqueuedPerMin), throughputHint)} + ${buildMetricCard('Dead letters', formatInt(totalDead), 'Items held in dead letter queues.')} + ${buildMetricCard('Active workers', formatInt(activeWorkers), 'Workers that published heartbeats within the retention window.')} + ${buildMetricCard('Running tasks', formatInt(runningTasks.length), 'Latest persisted statuses currently in running state.')} + ${buildMetricCard('Failed tasks', formatInt(failedTasks.length), 'Latest persisted statuses that ended in failure.')} + ${buildMetricCard('Workflow tasks', formatInt(workflowTasks.length), 'Recent task statuses tied to workflow execution.')} + ${buildMetricCard('Stuck queued', formatInt(stuckQueued), 'Queued longer than ${queuedStuckThreshold.inMinutes}m.')} + ${buildMetricCard('Stuck running', formatInt(stuckRunning), 'Running longer than ${runningStuckThreshold.inMinutes}m.')} + ${buildMetricCard('SLA breaches', formatInt(slaBreaches), 'Queue wait > 1m or processing > 5m in recent statuses.')} +
+'''; + + final namespaceSection = + ''' +
+
+

Namespaces

+
+ + + + + + + + + + + + + + + + ${namespaces.isEmpty ? ''' + + + +''' : namespaces.map((summary) => ''' + + + + + + + + + + + +''').join()} + +
NamespaceQueuesWorkersBacklogIn-flightDead lettersRunningFailedWorkflow runs
No namespaces discovered yet.
${escapeHtml(summary.namespace)}${formatInt(summary.queueCount)}${formatInt(summary.workerCount)}${formatInt(summary.pending)}${formatInt(summary.inflight)}${formatInt(summary.deadLetters)}${formatInt(summary.runningTasks)}${formatInt(summary.failedTasks)}${formatInt(summary.workflowRuns)}
+
+'''; + + final topQueuesSection = + ''' +
+ + + + + + + + + + + ${topQueues.isEmpty ? buildEmptyQueuesRow('No queues detected yet.') : topQueues.map(buildQueueTableRow).join()} + +
QueuePendingIn-flightDead letters
+
+'''; + + final workflowSection = + ''' +
+
+

Workflow Runs (Sample)

+
+ + + + + + + + + + + + + + + + ${workflowRuns.isEmpty ? ''' + + + +''' : workflowRuns.map((run) => ''' + + + + + + + + + + + +''').join()} + +
Run IDWorkflowStepQueuedRunningSucceededFailedCancelledUpdated
No workflow run metadata found in sampled task statuses.
${escapeHtml(run.runId)}${escapeHtml(run.workflowName)}${escapeHtml(run.lastStep ?? '—')}${formatInt(run.queued)}${formatInt(run.running)}${formatInt(run.succeeded)}${formatInt(run.failed)}${formatInt(run.cancelled)}${formatRelative(run.lastUpdated)}
+
+'''; + + final jobSection = + ''' +
+
+

Jobs (Task Families)

+
+ + + + + + + + + + + + + + + + ${jobs.isEmpty ? ''' + + + +''' : jobs.map((job) => ''' + + + + + + + + + + + +''').join()} + +
TaskQueueSampledRunningSucceededFailedRetriedFailure ratioUpdated
No task families discovered yet.
${escapeHtml(job.taskName)}${escapeHtml(job.sampleQueue)}${formatInt(job.total)}${formatInt(job.running)}${formatInt(job.succeeded)}${formatInt(job.failed)}${formatInt(job.retried)}${(job.failureRatio * 100).toStringAsFixed(1)}%${formatRelative(job.lastUpdated)}
+
+'''; + + final latencySection = + ''' +
+ + + + + + + + + + + + ${queueLatency.isEmpty ? ''' + + + +''' : queueLatency.map((row) => ''' + + + + + + + +''').join()} + +
QueueSamplesWait avgRun avgSLA breaches
No queue latency samples available yet.
${escapeHtml(row.queue)}${formatInt(row.samples)}${row.avgWaitLabel}${row.avgRunLabel}${formatInt(row.slaBreaches)}
+
+'''; + + final recentTasksSection = + ''' +
+ ${buildTaskStatusTable( + taskStatuses.take(8).toList(growable: false), + options: const DashboardTaskTableOptions( + showAttempt: false, + showError: false, + showActions: false, + emptyMessage: 'No persisted task statuses yet.', + ), + )} +
+'''; + + return OverviewSections( + metrics: metrics, + namespaces: namespaceSection, + topQueues: topQueuesSection, + workflows: workflowSection, + jobs: jobSection, + latency: latencySection, + recentTasks: recentTasksSection, + ); +} + +List<_QueueLatencyRow> _buildQueueLatency( + List tasks, +) { + final byQueue = {}; + const queueSla = Duration(minutes: 1); + const runSla = Duration(minutes: 5); + + for (final task in tasks) { + byQueue + .putIfAbsent( + task.queue, + () => _QueueLatencyAccumulator(queue: task.queue), + ) + .add( + wait: task.queueWait, + run: task.processingTime, + queueSla: queueSla, + runSla: runSla, + ); + } + + final rows = + byQueue.values.map((value) => value.build()).toList(growable: false) + ..sort((a, b) => b.slaBreaches.compareTo(a.slaBreaches)); + return rows.take(8).toList(growable: false); +} + +class _QueueLatencyAccumulator { + _QueueLatencyAccumulator({required this.queue}); + + final String queue; + final _wait = []; + final _run = []; + var _samples = 0; + var _breaches = 0; + + void add({ + required Duration? wait, + required Duration? run, + required Duration queueSla, + required Duration runSla, + }) { + _samples += 1; + if (wait != null) { + _wait.add(wait.inMilliseconds); + if (wait > queueSla) _breaches += 1; + } + if (run != null) { + _run.add(run.inMilliseconds); + if (run > runSla) _breaches += 1; + } + } + + _QueueLatencyRow build() { + return _QueueLatencyRow( + queue: queue, + samples: _samples, + avgWaitMs: _average(_wait), + avgRunMs: _average(_run), + slaBreaches: _breaches, + ); + } + + int _average(List values) { + if (values.isEmpty) return 0; + final total = values.fold(0, (sum, value) => sum + value); + return (total / values.length).round(); + } +} + +class _QueueLatencyRow { + const _QueueLatencyRow({ + required this.queue, + required this.samples, + required this.avgWaitMs, + required this.avgRunMs, + required this.slaBreaches, + }); + + final String queue; + final int samples; + final int avgWaitMs; + final int avgRunMs; + final int slaBreaches; + + String get avgWaitLabel => _formatMs(avgWaitMs); + String get avgRunLabel => _formatMs(avgRunMs); + + String _formatMs(int millis) { + if (millis <= 0) return '—'; + if (millis < 1000) return '${millis}ms'; + return '${(millis / 1000).toStringAsFixed(2)}s'; + } +} diff --git a/packages/dashboard/lib/src/ui/paths.dart b/packages/dashboard/lib/src/ui/paths.dart new file mode 100644 index 00000000..fe1e60e3 --- /dev/null +++ b/packages/dashboard/lib/src/ui/paths.dart @@ -0,0 +1,36 @@ +/// Normalizes a dashboard base path for route mounting. +String normalizeDashboardBasePath(String basePath) { + final trimmed = basePath.trim(); + if (trimmed.isEmpty || trimmed == '/') return ''; + final leading = trimmed.startsWith('/') ? trimmed : '/$trimmed'; + return leading.endsWith('/') + ? leading.substring(0, leading.length - 1) + : leading; +} + +/// Builds a dashboard route by combining [basePath] and [path]. +String dashboardRoute(String basePath, String path) { + final normalizedBasePath = normalizeDashboardBasePath(basePath); + final normalizedPath = path.startsWith('/') ? path : '/$path'; + if (normalizedBasePath.isEmpty) return normalizedPath; + if (normalizedPath == '/') return normalizedBasePath; + return '$normalizedBasePath$normalizedPath'; +} + +/// Prefixes root-relative HTML URL attributes with [basePath]. +String prefixDashboardUrlAttributes(String html, String basePath) { + final normalizedBasePath = normalizeDashboardBasePath(basePath); + if (normalizedBasePath.isEmpty) return html; + + return html.replaceAllMapped( + RegExp(r'''(href|action|value)=(["'])/(?!/)([^"']*)\2'''), + (match) { + final attribute = match.group(1)!; + final quote = match.group(2)!; + final remainder = match.group(3)!; + final path = remainder.isEmpty ? '/' : '/$remainder'; + final resolved = dashboardRoute(normalizedBasePath, path); + return '$attribute=$quote$resolved$quote'; + }, + ); +} diff --git a/packages/dashboard/lib/src/ui/search.dart b/packages/dashboard/lib/src/ui/search.dart new file mode 100644 index 00000000..91a20f0a --- /dev/null +++ b/packages/dashboard/lib/src/ui/search.dart @@ -0,0 +1,247 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildSearchContent({ + required SearchPageOptions options, + required List queues, + required List workers, + required List taskStatuses, + required List auditEntries, +}) { + final queryRaw = options.query?.trim() ?? ''; + final query = queryRaw.toLowerCase(); + final taskMatches = _isScope(options.scope, 'tasks') + ? _searchTasks(taskStatuses, query) + : const []; + final workerMatches = _isScope(options.scope, 'workers') + ? _searchWorkers(workers, query) + : const []; + final queueMatches = _isScope(options.scope, 'queues') + ? _searchQueues(queues, query) + : const []; + final auditMatches = _isScope(options.scope, 'audit') + ? _searchAudit(auditEntries, query) + : const []; + + return ''' + + +
+ ${buildMetricCard('Task hits', formatInt(taskMatches.length), 'Matching task statuses by id/name/queue/workflow/error.')} + ${buildMetricCard('Worker hits', formatInt(workerMatches.length), 'Matching worker IDs and queue assignments.')} + ${buildMetricCard('Queue hits', formatInt(queueMatches.length), 'Matching queue names.')} + ${buildMetricCard('Audit hits', formatInt(auditMatches.length), 'Matching operator actions and alert records.')} +
+ +
+ + + + ${buildSearchScopeSelect(options.scope)} + +
+ +
+ + + + + + + + + ${buildSavedViewRow('/tasks?state=failed', 'Failed tasks', 'Recent terminal failures')} + ${buildSavedViewRow('/tasks?state=running', 'Running tasks', 'Investigate long-running tasks')} + ${buildSavedViewRow('/tasks?sort=pending&direction=desc', 'Backlog hotspots', 'Queues with highest pending load')} + ${buildSavedViewRow('/failures', 'Failure diagnostics', 'Grouped error fingerprints + replay controls')} + ${buildSavedViewRow('/workers', 'Worker health', 'Capacity and control plane visibility')} + ${buildSavedViewRow('/audit', 'Audit log', 'Operator actions and alert deliveries')} + +
Saved ViewsIntent
+
+ +${buildSearchTable( + title: 'Tasks', + emptyMessage: options.hasQuery ? 'No task results matched this query.' : 'Enter a query to search tasks.', + headers: const ['Task ID', 'Task', 'Queue', 'State', 'Updated'], + rows: taskMatches.take(40).map((task) => ''' + + ${escapeHtml(task.id)} + ${escapeHtml(task.taskName)} + ${escapeHtml(task.queue)} + ${buildTaskStatePill(task.state)} + ${formatRelative(task.updatedAt)} + +''').toList(growable: false), + )} + +${buildSearchTable( + title: 'Workers', + emptyMessage: options.hasQuery ? 'No worker results matched this query.' : 'Enter a query to search workers.', + headers: const ['Worker', 'Queues', 'Inflight', 'Heartbeat'], + rows: workerMatches.take(30).map((worker) => ''' + + ${escapeHtml(worker.workerId)} + ${worker.queues.isEmpty ? '—' : worker.queues.map((queue) => '${escapeHtml(queue.name)}').join(' ')} + ${formatInt(worker.inflight)} + ${formatRelative(worker.timestamp)} + +''').toList(growable: false), + )} + +${buildSearchTable( + title: 'Queues', + emptyMessage: options.hasQuery ? 'No queue results matched this query.' : 'Enter a query to search queues.', + headers: const ['Queue', 'Pending', 'Inflight', 'Dead letters'], + rows: queueMatches.take(30).map((queue) => ''' + + ${escapeHtml(queue.queue)} + ${formatInt(queue.pending)} + ${formatInt(queue.inflight)} + ${formatInt(queue.deadLetters)} + +''').toList(growable: false), + )} + +${buildSearchTable( + title: 'Audit', + emptyMessage: options.hasQuery ? 'No audit entries matched this query.' : 'Enter a query to search audit events.', + headers: const ['Time', 'Kind', 'Action', 'Status', 'Summary'], + rows: auditMatches.take(60).map((entry) => ''' + + ${formatRelative(entry.timestamp)} + ${escapeHtml(entry.kind)} + ${escapeHtml(entry.action)} + ${escapeHtml(entry.status)} + ${escapeHtml(entry.summary ?? '—')} + +''').toList(growable: false), + )} +'''; +} + +String buildSearchScopeSelect(String currentScope) { + String option(String value, String label) { + final selected = currentScope == value ? ' selected' : ''; + return ''; + } + + return ''' + +'''; +} + +String buildSavedViewRow(String href, String label, String intent) { + return ''' + + ${escapeHtml(label)} + ${escapeHtml(intent)} + +'''; +} + +String buildSearchTable({ + required String title, + required String emptyMessage, + required List headers, + required List rows, +}) { + return ''' +
+
+

${escapeHtml(title)}

+
+ + + + ${headers.map((header) => '').join()} + + + + ${rows.isEmpty ? '' : rows.join()} + +
${escapeHtml(header)}
${escapeHtml(emptyMessage)}
+
+'''; +} + +bool _isScope(String scope, String target) => scope == 'all' || scope == target; + +List _searchTasks( + List tasks, + String query, +) { + if (query.isEmpty) return const []; + return tasks + .where((task) { + return task.id.toLowerCase().contains(query) || + task.taskName.toLowerCase().contains(query) || + task.queue.toLowerCase().contains(query) || + (task.runId?.toLowerCase().contains(query) ?? false) || + (task.errorMessage?.toLowerCase().contains(query) ?? false); + }) + .toList(growable: false); +} + +List _searchWorkers(List workers, String query) { + if (query.isEmpty) return const []; + return workers + .where((worker) { + if (worker.workerId.toLowerCase().contains(query)) return true; + for (final queue in worker.queues) { + if (queue.name.toLowerCase().contains(query)) return true; + } + return false; + }) + .toList(growable: false); +} + +List _searchQueues(List queues, String query) { + if (query.isEmpty) return const []; + return queues + .where((queue) => queue.queue.toLowerCase().contains(query)) + .toList(growable: false); +} + +List _searchAudit( + List audits, + String query, +) { + if (query.isEmpty) return const []; + return audits + .where((entry) { + if (entry.action.toLowerCase().contains(query)) return true; + if (entry.summary?.toLowerCase().contains(query) ?? false) { + return true; + } + if (entry.actor?.toLowerCase().contains(query) ?? false) return true; + if (entry.status.toLowerCase().contains(query)) return true; + for (final value in entry.metadata.values) { + if (value?.toString().toLowerCase().contains(query) ?? false) { + return true; + } + } + return false; + }) + .toList(growable: false); +} diff --git a/packages/dashboard/lib/src/ui/shared.dart b/packages/dashboard/lib/src/ui/shared.dart new file mode 100644 index 00000000..c6cdb686 --- /dev/null +++ b/packages/dashboard/lib/src/ui/shared.dart @@ -0,0 +1,288 @@ +import 'package:intl/intl.dart'; +import 'package:stem/stem.dart' show TaskState, stemNow; +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; + +final dashboardNumberFormat = NumberFormat.decimalPattern(); + +String buildQueueTableRow(QueueSummary summary) { + final escapedQueue = escapeHtml(summary.queue); + return ''' + + $escapedQueue + ${formatInt(summary.pending)} + ${formatInt(summary.inflight)} + ${formatInt(summary.deadLetters)} + + + +
+
Pending ${formatInt(summary.pending)}
+
In-flight ${formatInt(summary.inflight)}
+
Dead letters ${formatInt(summary.deadLetters)}
+
Detailed DLQ previews render here once the replay control is wired.
+
+ + +'''; +} + +String buildMetricCard(String title, String value, String caption) { + return ''' +
+
+
$title
+
$value
+

$caption

+
+'''; +} + +String buildEmptyQueuesRow(String message) { + return ''' + + $message + +'''; +} + +String escapeHtml(String value) { + return value + .replaceAll('&', '&') + .replaceAll('<', '<') + .replaceAll('>', '>') + .replaceAll('"', '"') + .replaceAll("'", '''); +} + +int totalIsolates(List workers) { + return workers.fold(0, (total, status) => total + status.isolateCount); +} + +String formatInt(int value) => dashboardNumberFormat.format(value); + +String formatRate(double value) { + if (value <= 0) return '0'; + if (value < 1) return value.toStringAsFixed(2); + return dashboardNumberFormat.format(value.round()); +} + +String formatRelative(DateTime timestamp) { + final now = stemNow().toUtc(); + final diff = now.difference(timestamp.toUtc()); + if (diff < const Duration(seconds: 30)) return 'just now'; + if (diff < const Duration(minutes: 1)) { + return '${diff.inSeconds}s ago'; + } + if (diff < const Duration(hours: 1)) { + return '${diff.inMinutes}m ago'; + } + if (diff < const Duration(days: 1)) { + return '${diff.inHours}h ago'; + } + return '${diff.inDays}d ago'; +} + +String formatDateTime(DateTime? timestamp) { + if (timestamp == null) return '—'; + return timestamp.toUtc().toIso8601String(); +} + +String formatObject(Object? value) { + if (value == null) return 'null'; + if (value is String) return value; + return value.toString(); +} + +String taskStateLabel(TaskState state) { + switch (state) { + case TaskState.queued: + return 'Queued'; + case TaskState.running: + return 'Running'; + case TaskState.succeeded: + return 'Succeeded'; + case TaskState.failed: + return 'Failed'; + case TaskState.retried: + return 'Retried'; + case TaskState.cancelled: + return 'Cancelled'; + } +} + +String taskStateClass(TaskState state) { + switch (state) { + case TaskState.succeeded: + return 'success'; + case TaskState.failed: + return 'error'; + case TaskState.cancelled: + return 'error'; + case TaskState.running: + return 'running'; + case TaskState.retried: + return 'warning'; + case TaskState.queued: + return 'muted'; + } +} + +String buildTaskStatePill(TaskState state) { + return '${taskStateLabel(state)}'; +} + +class DashboardTaskTableOptions { + const DashboardTaskTableOptions({ + this.showState = true, + this.showAttempt = true, + this.showUpdated = true, + this.showError = true, + this.showActions = true, + this.expandableRows = false, + this.emptyMessage = 'No task statuses available.', + this.actionsBuilder, + }); + + final bool showState; + final bool showAttempt; + final bool showUpdated; + final bool showError; + final bool showActions; + final bool expandableRows; + final String emptyMessage; + final String Function(DashboardTaskStatusEntry task)? actionsBuilder; +} + +String buildTaskStatusTable( + List tasks, { + required DashboardTaskTableOptions options, +}) { + final headers = [ + 'Task ID', + 'Task', + 'Queue', + if (options.showState) 'State', + if (options.showAttempt) 'Attempt', + if (options.showUpdated) 'Updated', + if (options.showError) 'Error', + if (options.showActions) 'Actions', + ]; + + final rows = tasks.isEmpty + ? ''' + + ${escapeHtml(options.emptyMessage)} + +''' + : tasks.map((task) => _buildTaskStatusRow(task, options: options)).join(); + + return ''' + + + + ${headers.map((header) => '').join()} + + + + $rows + +
$header
+'''; +} + +String buildTaskLifecycleActions( + DashboardTaskStatusEntry task, { + required String redirectPath, +}) { + final encodedId = escapeHtml(task.id); + final encodedQueue = escapeHtml(task.queue); + final encodedRedirect = escapeHtml(redirectPath); + final controls = []; + + if (task.state == TaskState.running || task.state == TaskState.queued) { + controls.add(''' +
+ + + + + +
+'''); + } + + if (task.state == TaskState.failed || task.state == TaskState.cancelled) { + controls.add(''' +
+ + + + + +
+'''); + } + + if (controls.isEmpty) { + return '—'; + } + return '
${controls.join()}
'; +} + +String buildTaskReplayAction( + DashboardTaskStatusEntry task, { + required String redirectPath, + String label = 'Replay task', +}) { + return ''' +
+ + + + + +
+'''; +} + +String _buildTaskStatusRow( + DashboardTaskStatusEntry task, { + required DashboardTaskTableOptions options, +}) { + final escapedId = escapeHtml(task.id); + final detailUrl = '/tasks/detail?id=${Uri.encodeQueryComponent(task.id)}'; + final inlineTarget = 'task-inline-$escapedId'; + final rowClass = options.expandableRows ? 'task-row' : ''; + final rowAttrs = options.expandableRows + ? ' data-task-row="$escapedId" ' + 'data-task-inline-target="$inlineTarget" aria-expanded="false"' + : ''; + + return ''' + + $escapedId + ${escapeHtml(task.taskName)} + ${escapeHtml(task.queue)} + ${options.showState ? '${buildTaskStatePill(task.state)}' : ''} + ${options.showAttempt ? '${formatInt(task.attempt)}' : ''} + ${options.showUpdated ? '${formatRelative(task.updatedAt)}' : ''} + ${options.showError ? '${escapeHtml(_compactError(task))}' : ''} + ${options.showActions ? '${options.actionsBuilder?.call(task) ?? '—'}' : ''} + +'''; +} + +String _compactError(DashboardTaskStatusEntry task, {int max = 120}) { + final raw = task.errorMessage ?? (task.retryable ? 'retryable' : '—'); + final singleLine = raw.replaceAll(RegExp(r'\s+'), ' ').trim(); + return _truncate(singleLine, max); +} + +String _truncate(String input, int max) { + if (input.length <= max) return input; + return '${input.substring(0, max - 1)}…'; +} diff --git a/packages/dashboard/lib/src/ui/task_detail.dart b/packages/dashboard/lib/src/ui/task_detail.dart new file mode 100644 index 00000000..7d1b7df4 --- /dev/null +++ b/packages/dashboard/lib/src/ui/task_detail.dart @@ -0,0 +1,303 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'dart:convert'; + +import 'package:stem/stem.dart' show TaskState; +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildTaskDetailContent( + DashboardTaskStatusEntry? task, + List runTimeline, + DashboardWorkflowRunSnapshot? workflowRun, + List workflowSteps, +) { + if (task == null) { + return ''' + +
+
+

Task not found

+

Try searching from the tasks page again.

+

Back to tasks

+
+
+'''; + } + + final timeline = List.from(runTimeline) + ..sort((a, b) => a.updatedAt.compareTo(b.updatedAt)); + final metadataEntries = task.meta.entries.toList(growable: false) + ..sort((a, b) => a.key.compareTo(b.key)); + + return ''' + + +
+ ${buildMetricCard('Task', escapeHtml(task.taskName), 'Handler name reported by task status metadata.')} + ${buildMetricCard('Queue', escapeHtml(task.queue), 'Queue where this task executed.')} + ${buildMetricCard('State', escapeHtml(task.state.name), 'Current persisted state.')} + ${buildMetricCard('Attempt', formatInt(task.attempt), 'Attempt count stored on the latest status write.')} +
+ +
+

Task actions

+
+ ${buildTaskActions(task)} +
+
+ +
+
+

Task Snapshot

+
+ + + + + + + + + +
Task ID${escapeHtml(task.id)}
Created${formatDateTime(task.createdAt)}
Updated${formatDateTime(task.updatedAt)}
Run ID${task.runId == null ? '—' : '${escapeHtml(task.runId!)}'}
Workflow${task.workflowName == null ? '—' : escapeHtml(task.workflowName!)}
Workflow Step${task.workflowStep == null ? '—' : escapeHtml(task.workflowStep!)}
+
+ +
+

Payload / Error

+
+
+
Payload
+
${escapeHtml(_prettyObject(task.payload))}
+
+
+
Error
+
${escapeHtml(_errorBlock(task))}
+
+
+
+ +
+
+

Metadata

+
+ + + + + + + + + ${metadataEntries.isEmpty ? ''' + + + +''' : metadataEntries.map((entry) => ''' + + + + +''').join()} + +
Metadata keyValue
No metadata fields persisted for this status.
${escapeHtml(entry.key)}${escapeHtml(_prettyObject(entry.value))}
+
+ +${buildWorkflowSection(task, workflowRun, workflowSteps, timeline)} +'''; +} + +String buildWorkflowSection( + DashboardTaskStatusEntry task, + DashboardWorkflowRunSnapshot? workflowRun, + List workflowSteps, + List timeline, +) { + if (task.runId == null) { + return ''' +
+
+

No workflow linkage

+

This task status does not include stem.workflow.runId metadata.

+
+
+'''; + } + + final runId = task.runId!; + return ''' +
+
+

Workflow Run

+
+ + + + + + + + + + + + ${workflowRun == null ? ''' + + + +''' : ''' + + + + + + + +'''} + +
Workflow runStatusCursorUpdatedWait topic
Workflow store is unavailable or this run is no longer persisted.
${escapeHtml(runId)}${escapeHtml(workflowRun.status.name)}${formatInt(workflowRun.cursor)}${formatDateTime(workflowRun.updatedAt)}${escapeHtml(workflowRun.waitTopic ?? '—')}
+
+ +
+
+

Workflow Steps

+
+ + + + + + + + + + + ${workflowSteps.isEmpty ? ''' + + + +''' : workflowSteps.map((step) => ''' + + + + + + +''').join()} + +
StepPositionCompletedValue
No persisted workflow step checkpoints found.
${escapeHtml(step.name)}${formatInt(step.position)}${formatDateTime(step.completedAt)}${escapeHtml(_prettyObject(step.value))}
+
+ +
+
+

Run Timeline

+
+ + + + + + + + + + + + + ${timeline.isEmpty ? ''' + + + +''' : timeline.map((entry) => ''' + + + + + + + + +''').join()} + +
Task IDTaskStepStateAttemptUpdated
No related statuses were found for run ${escapeHtml(runId)}.
${escapeHtml(entry.id)}${escapeHtml(entry.taskName)}${escapeHtml(entry.workflowStep ?? '—')}${buildTaskStatePill(entry.state)}${formatInt(entry.attempt)}${formatRelative(entry.updatedAt)}
+
+'''; +} + +String _prettyObject(Object? value) { + if (value == null) return 'null'; + if (value is String) return value; + if (value is num || value is bool) return value.toString(); + try { + return const JsonEncoder.withIndent(' ').convert(value); + } on Object { + return value.toString(); + } +} + +String _errorBlock(DashboardTaskStatusEntry task) { + if (task.errorMessage == null && + task.errorType == null && + task.errorStack == null) { + return 'No error payload recorded.'; + } + final buffer = StringBuffer(); + if (task.errorType != null && task.errorType!.isNotEmpty) { + buffer.writeln(task.errorType); + } + if (task.errorMessage != null && task.errorMessage!.isNotEmpty) { + buffer.writeln(task.errorMessage); + } + if (task.errorStack != null && task.errorStack!.isNotEmpty) { + buffer + ..writeln() + ..writeln(task.errorStack); + } + return buffer.toString().trim(); +} + +String buildTaskActions(DashboardTaskStatusEntry task) { + final redirect = '/tasks/detail?id=${Uri.encodeQueryComponent(task.id)}'; + final encodedId = escapeHtml(task.id); + final encodedQueue = escapeHtml(task.queue); + final actions = []; + + if (task.state == TaskState.running || task.state == TaskState.queued) { + actions.add(''' +
+ + + + + +
+'''); + } + + if (task.state == TaskState.failed || task.state == TaskState.cancelled) { + actions.add(''' +
+ + + + + +
+'''); + } + + actions.add( + 'Back to tasks', + ); + return actions.join(); +} diff --git a/packages/dashboard/lib/src/ui/tasks.dart b/packages/dashboard/lib/src/ui/tasks.dart new file mode 100644 index 00000000..466abe70 --- /dev/null +++ b/packages/dashboard/lib/src/ui/tasks.dart @@ -0,0 +1,602 @@ +// HTML template strings are kept on single lines for readability. +// ignore_for_file: lines_longer_than_80_chars, public_member_api_docs + +import 'dart:convert'; + +import 'package:stem/stem.dart' show TaskState, stemNow; +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildTasksContent( + List queues, + TasksPageOptions options, + List taskStatuses, +) { + var filtered = + options.hasFilter + ? queues + .where( + (summary) => summary.queue.toLowerCase().contains( + options.filter!.toLowerCase(), + ), + ) + .toList() + : List.of(queues) + ..sort((a, b) => compareQueues(a, b, options)); + if (options.descending) { + filtered = filtered.reversed.toList(); + } + + final totalQueues = filtered.length; + final dlqTotal = filtered.fold( + 0, + (total, summary) => total + summary.deadLetters, + ); + final runningCount = taskStatuses + .where((task) => task.state == TaskState.running) + .length; + final failedCount = taskStatuses + .where((task) => task.state == TaskState.failed) + .length; + final retriedCount = taskStatuses + .where((task) => task.state == TaskState.retried) + .length; + final now = stemNow().toUtc(); + const queuedStuckThreshold = Duration(minutes: 5); + const runningStuckThreshold = Duration(minutes: 15); + final stuckQueued = taskStatuses + .where((task) { + if (task.state != TaskState.queued) return false; + return now.difference(task.createdAt.toUtc()) > queuedStuckThreshold; + }) + .toList(growable: false); + final stuckRunning = taskStatuses + .where((task) { + if (task.state != TaskState.running) return false; + final anchor = task.startedAt ?? task.updatedAt.toUtc(); + return now.difference(anchor) > runningStuckThreshold; + }) + .toList(growable: false); + final queueLatency = _buildQueueLatency(taskStatuses); + final slaBreaches = queueLatency.fold( + 0, + (total, item) => total + item.slaBreaches, + ); + final taskActionRedirect = _buildTasksRedirect(options); + + return ''' + + +${renderTasksAlert(options)} + +
+ ${buildMetricCard('Tracked queues', formatInt(totalQueues), 'Queues discovered via Redis stream prefixes.')} + ${buildMetricCard('Dead letter size', formatInt(dlqTotal), 'Aggregate items across all dead letter queues.')} + ${buildMetricCard('Running tasks', formatInt(runningCount), 'Recent statuses currently processing.')} + ${buildMetricCard('Failed tasks', formatInt(failedCount), 'Recent statuses with terminal failures.')} + ${buildMetricCard('Retried tasks', formatInt(retriedCount), 'Recent statuses scheduled for another attempt.')} + ${buildMetricCard('Stuck queued', formatInt(stuckQueued.length), 'Queued beyond ${queuedStuckThreshold.inMinutes}m from initial status creation.')} + ${buildMetricCard('Stuck running', formatInt(stuckRunning.length), 'Running beyond ${runningStuckThreshold.inMinutes}m from start heartbeat metadata.')} + ${buildMetricCard('SLA breaches', formatInt(slaBreaches), 'Queue wait > 1m or processing > 5m across sampled statuses.')} +
+ +
+ + + + + + + + + + ${buildTaskStateFilterSelect(options)} + + ${buildTaskPageSizeSelect(options)} + + + + + ${(options.hasFilter || options.hasNamespaceFilter || options.hasTaskFilter || options.hasRunIdFilter) ? 'Clear' : ''} +
+ +
+
+

Queue Snapshot

+
+ + + + + + + + + + + ${filtered.isEmpty ? buildEmptyQueuesRow('No streams found for the configured namespace.') : filtered.map(buildQueueTableRow).join()} + +
${buildSortableHeader('Queue', 'queue', options)}${buildSortableHeader('Pending', 'pending', options)}${buildSortableHeader('In-flight', 'inflight', options)}${buildSortableHeader('Dead letters', 'dead', options)}
+
+ +
+
+

Latency Watch

+
+ + + + + + + + + + + + + + ${queueLatency.isEmpty ? ''' + + + +''' : queueLatency.map((latency) => ''' + + + + + + + + + +''').join()} + +
QueueSamplesWait avgWait p95Run avgRun p95SLA breaches
No latency samples yet. Task metadata must include started/completed timestamps.
${escapeHtml(latency.queue)}${formatInt(latency.samples)}${latency.avgWaitLabel}${latency.p95WaitLabel}${latency.avgRunLabel}${latency.p95RunLabel}${formatInt(latency.slaBreaches)}
+
+ +
+
+

Recent Statuses

+
+ ${buildTaskPaginationBar(options, taskStatuses.length)} + ${buildTaskStatusTable( + taskStatuses, + options: DashboardTaskTableOptions( + emptyMessage: 'No task statuses match the current filters.', + expandableRows: true, + actionsBuilder: (task) => + buildTaskLifecycleActions(task, redirectPath: taskActionRedirect), + ), + )} + ${buildTaskPaginationBar(options, taskStatuses.length)} +
+ +
+
+

Ad-hoc enqueue

+
+
+ + + + + +
+ +
+
+
+'''; +} + +String renderTasksAlert(TasksPageOptions options) { + String? message; + var type = 'success'; + switch (options.flashKey) { + case 'queued': + message = 'Task enqueued successfully.'; + default: + message = options.flashKey; + } + switch (options.errorKey) { + case 'missing-fields': + message = 'Queue and task name are required.'; + type = 'error'; + case 'invalid-payload': + message = 'Payload must be valid JSON describing an object.'; + type = 'error'; + case 'enqueue-failed': + message = + 'Failed to enqueue the task. Check the dashboard logs for details.'; + type = 'error'; + default: + if (options.errorKey != null && options.errorKey!.isNotEmpty) { + message = options.errorKey; + type = 'error'; + } + } + + if (message == null) return ''; + return '
${escapeHtml(message)}
'; +} + +int compareQueues(QueueSummary a, QueueSummary b, TasksPageOptions options) { + switch (options.sortKey) { + case 'pending': + return a.pending.compareTo(b.pending); + case 'inflight': + return a.inflight.compareTo(b.inflight); + case 'dead': + return a.deadLetters.compareTo(b.deadLetters); + case 'queue': + default: + return a.queue.toLowerCase().compareTo(b.queue.toLowerCase()); + } +} + +String buildSortableHeader(String label, String key, TasksPageOptions options) { + final isActive = options.sortKey == key; + final descendingNext = isActive ? !options.descending : key != 'queue'; + final params = { + 'sort': key, + 'direction': descendingNext ? 'desc' : 'asc', + 'page': '1', + 'pageSize': '${options.pageSize}', + }; + if (options.hasFilter) { + params['queue'] = options.filter!; + } + if (options.hasNamespaceFilter) { + params['namespace'] = options.namespaceFilter!; + } + if (options.hasTaskFilter) { + params['task'] = options.taskFilter!; + } + if (options.hasRunIdFilter) { + params['runId'] = options.runId!; + } + if (options.hasStateFilter) { + params['state'] = options.stateFilter!.name; + } + final query = buildQuery(params); + final indicator = isActive ? (options.descending ? '↓' : '↑') : ''; + final classes = isActive ? 'sort-link active' : 'sort-link'; + return '$label $indicator'; +} + +String buildQuery(Map params) { + return params.entries + .map( + (entry) => + '${Uri.encodeQueryComponent(entry.key)}=${Uri.encodeQueryComponent(entry.value)}', + ) + .join('&'); +} + +String buildTaskStateFilterSelect(TasksPageOptions options) { + final current = options.stateFilter?.name; + String option(TaskState? state, String label) { + final value = state?.name ?? ''; + final selected = current == value ? ' selected' : ''; + return ''; + } + + return ''' + +'''; +} + +String buildTaskPageSizeSelect(TasksPageOptions options) { + const sizes = [25, 50, 100, 200]; + final normalized = sizes.contains(options.pageSize) ? options.pageSize : 25; + final optionsHtml = sizes.map((size) { + final selected = normalized == size ? ' selected' : ''; + return ''; + }).join(); + return ''; +} + +String buildTaskPaginationBar(TasksPageOptions options, int rowsOnPage) { + if (!options.hasPagination) { + return ''; + } + final start = rowsOnPage == 0 ? 0 : options.offset + 1; + final end = options.offset + rowsOnPage; + final baseParams = { + 'sort': options.sortKey, + 'direction': options.descending ? 'desc' : 'asc', + 'pageSize': '${options.pageSize}', + }; + if (options.hasFilter) { + baseParams['queue'] = options.filter!; + } + if (options.hasNamespaceFilter) { + baseParams['namespace'] = options.namespaceFilter!; + } + if (options.hasTaskFilter) { + baseParams['task'] = options.taskFilter!; + } + if (options.hasRunIdFilter) { + baseParams['runId'] = options.runId!; + } + if (options.hasStateFilter) { + baseParams['state'] = options.stateFilter!.name; + } + + final previousLink = options.hasPreviousPage + ? 'Previous' + : 'Previous'; + final nextLink = options.hasNextPage + ? 'Next' + : 'Next'; + + return ''' +
+ Page ${options.page} • Showing $start-$end +
+ $previousLink + $nextLink +
+
+'''; +} + +String _buildTasksRedirect(TasksPageOptions options) { + final params = {}; + if (options.hasFilter) { + params['queue'] = options.filter!; + } + if (options.hasNamespaceFilter) { + params['namespace'] = options.namespaceFilter!; + } + if (options.hasTaskFilter) { + params['task'] = options.taskFilter!; + } + if (options.hasRunIdFilter) { + params['runId'] = options.runId!; + } + if (options.hasStateFilter) { + params['state'] = options.stateFilter!.name; + } + if (params.isEmpty) return '/tasks'; + return '/tasks?${buildQuery(params)}'; +} + +String buildTaskInlinePanel(DashboardTaskStatusEntry? task) { + if (task == null) { + return ''' +
+

Task detail could not be loaded from the result backend.

+
+'''; + } + + final detailUrl = '/tasks/detail?id=${Uri.encodeQueryComponent(task.id)}'; + final metadataEntries = task.meta.entries.toList(growable: false) + ..sort((a, b) => a.key.compareTo(b.key)); + final metadataPreview = metadataEntries.isEmpty + ? 'No metadata fields were persisted.' + : '
${metadataEntries.take(8).map((entry) => ''' +
+ ${escapeHtml(entry.key)} + ${escapeHtml(_compactObject(entry.value))} +
+''').join()}
${metadataEntries.length > 8 ? '

+${metadataEntries.length - 8} more metadata fields in full detail view.

' : ''}'; + final payloadPreview = escapeHtml(_compactObject(task.payload, max: 480)); + final errorPreview = escapeHtml(_buildErrorPreview(task)); + final workflowSummary = task.runId == null + ? 'No workflow linkage' + : '${escapeHtml(task.runId!)}${task.workflowStep == null ? '' : ' · ${escapeHtml(task.workflowStep!)}'}'; + + return ''' +
+
+
Timestamps
+
Created: ${escapeHtml(formatDateTime(task.createdAt))}
+
Started: ${escapeHtml(formatDateTime(task.startedAt))}
+
Finished: ${escapeHtml(formatDateTime(task.finishedAt))}
+
+
+
Workflow
+
$workflowSummary
+
Updated ${escapeHtml(formatRelative(task.updatedAt))}
+
+
+
Error snapshot
+
$errorPreview
+
+
+
Payload snapshot
+
$payloadPreview
+
+
+
+
Metadata
+ $metadataPreview +
+ +'''; +} + +String _buildErrorPreview(DashboardTaskStatusEntry task) { + if (task.errorMessage == null && + task.errorType == null && + task.errorStack == null) { + return 'No error payload recorded.'; + } + final buffer = StringBuffer(); + if (task.errorType != null && task.errorType!.isNotEmpty) { + buffer.writeln(task.errorType); + } + if (task.errorMessage != null && task.errorMessage!.isNotEmpty) { + buffer.writeln(task.errorMessage); + } + if (task.errorStack != null && task.errorStack!.isNotEmpty) { + final stack = _truncate(task.errorStack!, 360); + if (stack.isNotEmpty) { + buffer + ..writeln() + ..writeln(stack); + } + } + return buffer.toString().trim(); +} + +String _compactObject(Object? value, {int max = 180}) { + final pretty = _prettyObject(value).replaceAll('\n', ' '); + return _truncate(pretty, max); +} + +String _truncate(String input, int max) { + if (input.length <= max) return input; + return '${input.substring(0, max)}...'; +} + +String _prettyObject(Object? value) { + if (value == null) return 'null'; + if (value is String) return value; + if (value is num || value is bool) return value.toString(); + try { + return const JsonEncoder.withIndent(' ').convert(value); + } on Object { + return value.toString(); + } +} + +List<_QueueLatencyRow> _buildQueueLatency( + List tasks, +) { + final byQueue = {}; + const queueSla = Duration(minutes: 1); + const runSla = Duration(minutes: 5); + + for (final task in tasks) { + final bucket = byQueue.putIfAbsent( + task.queue, + () => _QueueLatencyAccumulator(queue: task.queue), + ); + final wait = task.queueWait; + final run = task.processingTime; + bucket.add(wait: wait, run: run, queueSla: queueSla, runSla: runSla); + } + + final rows = + byQueue.values.map((value) => value.build()).toList(growable: false) + ..sort((a, b) => b.slaBreaches.compareTo(a.slaBreaches)); + return rows; +} + +class _QueueLatencyAccumulator { + _QueueLatencyAccumulator({required this.queue}); + + final String queue; + final _waitSamples = []; + final _runSamples = []; + var _samples = 0; + var _breaches = 0; + + void add({ + required Duration? wait, + required Duration? run, + required Duration queueSla, + required Duration runSla, + }) { + _samples += 1; + if (wait != null) { + _waitSamples.add(wait.inMilliseconds); + if (wait > queueSla) _breaches += 1; + } + if (run != null) { + _runSamples.add(run.inMilliseconds); + if (run > runSla) _breaches += 1; + } + } + + _QueueLatencyRow build() { + return _QueueLatencyRow( + queue: queue, + samples: _samples, + avgWaitMs: _average(_waitSamples), + p95WaitMs: _percentile(_waitSamples, 0.95), + avgRunMs: _average(_runSamples), + p95RunMs: _percentile(_runSamples, 0.95), + slaBreaches: _breaches, + ); + } + + int _average(List values) { + if (values.isEmpty) return 0; + final total = values.fold(0, (sum, value) => sum + value); + return (total / values.length).round(); + } + + int _percentile(List values, double p) { + if (values.isEmpty) return 0; + final sorted = List.from(values)..sort(); + final index = ((sorted.length - 1) * p).round().clamp(0, sorted.length - 1); + return sorted[index]; + } +} + +class _QueueLatencyRow { + const _QueueLatencyRow({ + required this.queue, + required this.samples, + required this.avgWaitMs, + required this.p95WaitMs, + required this.avgRunMs, + required this.p95RunMs, + required this.slaBreaches, + }); + + final String queue; + final int samples; + final int avgWaitMs; + final int p95WaitMs; + final int avgRunMs; + final int p95RunMs; + final int slaBreaches; + + String get avgWaitLabel => _formatMs(avgWaitMs); + String get p95WaitLabel => _formatMs(p95WaitMs); + String get avgRunLabel => _formatMs(avgRunMs); + String get p95RunLabel => _formatMs(p95RunMs); + + String _formatMs(int millis) { + if (millis <= 0) return '—'; + if (millis < 1000) return '${millis}ms'; + return '${(millis / 1000).toStringAsFixed(2)}s'; + } +} diff --git a/packages/dashboard/lib/src/ui/workers.dart b/packages/dashboard/lib/src/ui/workers.dart new file mode 100644 index 00000000..f580faa5 --- /dev/null +++ b/packages/dashboard/lib/src/ui/workers.dart @@ -0,0 +1,381 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'dart:math' as math; + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildWorkersContent( + List workers, + List queues, + WorkersPageOptions options, +) { + final filteredWorkers = options.hasNamespaceFilter + ? workers + .where( + (worker) => + worker.namespace.toLowerCase() == + options.namespaceFilter!.toLowerCase(), + ) + .toList(growable: false) + : workers; + final healthyWorkers = filteredWorkers.where((worker) { + return worker.age <= const Duration(minutes: 2); + }).length; + + final busy = filteredWorkers.where((worker) => worker.inflight > 0).length; + final overloaded = filteredWorkers.where((worker) { + final cap = worker.isolateCount <= 0 ? 1 : worker.isolateCount; + return worker.inflight / cap >= 0.8; + }).length; + final queueCoverage = _buildQueueCoverage(filteredWorkers, queues); + final imbalance = _computeImbalance(queueCoverage); + final queueMap = {for (final summary in queues) summary.queue: summary}; + + return ''' + + +${renderWorkersAlert(options)} + +
+ ${buildMetricCard('Healthy workers', formatInt(healthyWorkers), 'Heartbeats received within the last two minutes.')} + ${buildMetricCard('Busy workers', formatInt(busy), 'Workers currently processing at least one task.')} + ${buildMetricCard('High saturation', formatInt(overloaded), 'Workers at or above 80% inflight-to-isolate capacity.')} + ${buildMetricCard('Queue imbalance', imbalance.toStringAsFixed(2), 'Stddev of worker coverage across discovered queues.')} + ${buildMetricCard('Isolates in use', formatInt(totalIsolates(filteredWorkers)), 'Sum of worker isolates across the cluster.')} +
+ +
+ + + + ${options.hasNamespaceFilter ? 'Clear' : ''} +
+ +
+
+

Worker Heartbeats

+
+ + + + + + + + + + + + + + ${filteredWorkers.isEmpty ? ''' + + + + ''' : filteredWorkers.map((worker) => buildWorkerRow(worker, namespaceFilter: options.namespaceFilter)).join()} + +
WorkerNamespaceQueuesInflightSaturationLast heartbeatActions
No heartbeats detected for namespace "${escapeHtml(options.namespaceFilter ?? 'stem')}".
+
+ +${_buildQueueCoverageSection(queueCoverage)} + +${buildClusterControls(namespaceFilter: options.namespaceFilter)} + +${buildQueueRecoverySection(queueMap, namespaceFilter: options.namespaceFilter)} +'''; +} + +String buildWorkerRow(WorkerStatus status, {String? namespaceFilter}) { + final queues = status.queues.isEmpty + ? '—' + : status.queues + .map( + (queue) => '${escapeHtml(queue.name)}', + ) + .join(' '); + return ''' + + ${escapeHtml(status.workerId)} + ${escapeHtml(status.namespace)} + $queues + ${formatInt(status.inflight)} + ${buildSaturationPill(status)} + ${formatRelative(status.timestamp)} + +
+ ${buildWorkerActionButton('Ping', 'ping', status.workerId, namespaceFilter: namespaceFilter)} + ${buildWorkerActionButton('Pause', 'pause', status.workerId, namespaceFilter: namespaceFilter)} + ${buildWorkerActionButton('Shutdown', 'shutdown', status.workerId, namespaceFilter: namespaceFilter)} +
+ + +'''; +} + +String buildSaturationPill(WorkerStatus status) { + final capacity = status.isolateCount <= 0 ? 1 : status.isolateCount; + final ratio = status.inflight / capacity; + final label = '${(ratio * 100).round()}%'; + final style = ratio >= 0.8 + ? 'error' + : ratio >= 0.5 + ? 'warning' + : 'success'; + return '$label'; +} + +String buildWorkerActionButton( + String label, + String action, + String workerId, { + String? namespaceFilter, +}) { + return ''' +
+ + + ${namespaceFilter == null || namespaceFilter.isEmpty ? '' : ''} + +
+'''; +} + +String buildClusterControls({String? namespaceFilter}) { + return ''' +
+

Cluster controls

+

Broadcast commands to all workers in the namespace.

+
+ ${buildClusterActionButton('Ping all workers', 'ping', namespaceFilter: namespaceFilter)} + ${buildClusterActionButton('Pause all workers', 'pause', namespaceFilter: namespaceFilter)} + ${buildClusterActionButton('Shutdown all workers', 'shutdown', namespaceFilter: namespaceFilter)} +
+
+'''; +} + +String buildClusterActionButton( + String label, + String action, { + String? namespaceFilter, +}) { + return ''' +
+ + + ${namespaceFilter == null || namespaceFilter.isEmpty ? '' : ''} + +
+'''; +} + +String buildQueueRecoverySection( + Map queues, { + String? namespaceFilter, +}) { + if (queues.isEmpty) return ''; + final rows = queues.values.toList() + ..sort((a, b) => a.queue.compareTo(b.queue)); + return ''' +
+
+

Queue Recovery

+
+ + + + + + + + + + + ${rows.map((summary) => buildQueueRecoveryRow(summary, namespaceFilter: namespaceFilter)).join()} + +
QueuePendingDead lettersReplay
+
+'''; +} + +String _buildQueueCoverageSection(List<_QueueCoverageRow> rows) { + if (rows.isEmpty) return ''; + return ''' +
+
+

Queue Coverage

+
+ + + + + + + + + + + + ${rows.map((row) => ''' + + + + + + + +''').join()} + +
QueueWorkers assignedInflight loadPendingDead letters
${escapeHtml(row.queue)}${formatInt(row.workerCount)}${formatInt(row.inflight)}${formatInt(row.pending)}${formatInt(row.deadLetters)}
+
+'''; +} + +List<_QueueCoverageRow> _buildQueueCoverage( + List workers, + List queues, +) { + final map = {}; + for (final queue in queues) { + map.putIfAbsent( + queue.queue, + () => _QueueCoverageRowBuilder( + queue: queue.queue, + pending: queue.pending, + deadLetters: queue.deadLetters, + ), + ); + } + for (final worker in workers) { + for (final queue in worker.queues) { + final entry = map.putIfAbsent( + queue.name, + () => _QueueCoverageRowBuilder(queue: queue.name), + ); + entry.workers.add(worker.workerId); + entry.inflight += queue.inflight; + } + } + final rows = map.values.map((value) => value.build()).toList(growable: false) + ..sort((a, b) { + final byWorkers = a.workerCount.compareTo(b.workerCount); + if (byWorkers != 0) return byWorkers; + return b.pending.compareTo(a.pending); + }); + return rows; +} + +double _computeImbalance(List<_QueueCoverageRow> rows) { + if (rows.length <= 1) return 0; + final values = rows.map((row) => row.workerCount.toDouble()).toList(); + final mean = values.reduce((a, b) => a + b) / values.length; + var variance = 0.0; + for (final value in values) { + final delta = value - mean; + variance += delta * delta; + } + variance /= values.length; + return math.sqrt(variance); +} + +class _QueueCoverageRowBuilder { + _QueueCoverageRowBuilder({ + required this.queue, + this.pending = 0, + this.deadLetters = 0, + }); + + final String queue; + final Set workers = {}; + int inflight = 0; + int pending; + int deadLetters; + + _QueueCoverageRow build() { + return _QueueCoverageRow( + queue: queue, + workerCount: workers.length, + inflight: inflight, + pending: pending, + deadLetters: deadLetters, + ); + } +} + +class _QueueCoverageRow { + const _QueueCoverageRow({ + required this.queue, + required this.workerCount, + required this.inflight, + required this.pending, + required this.deadLetters, + }); + + final String queue; + final int workerCount; + final int inflight; + final int pending; + final int deadLetters; +} + +String buildQueueRecoveryRow(QueueSummary summary, {String? namespaceFilter}) { + final limitDefault = summary.deadLetters <= 0 + ? 50 + : summary.deadLetters.clamp(1, 50); + final redirect = namespaceFilter == null || namespaceFilter.isEmpty + ? '/workers' + : '/workers?namespace=${Uri.encodeQueryComponent(namespaceFilter)}'; + final action = summary.deadLetters == 0 + ? 'No dead letters' + : ''' +
+ + + + +
+ '''; + return ''' + + ${escapeHtml(summary.queue)} + ${formatInt(summary.pending)} + ${formatInt(summary.deadLetters)} + $action + +'''; +} + +String renderWorkersAlert(WorkersPageOptions options) { + if (options.hasError) { + final scope = options.hasScope + ? '
Target: ${escapeHtml(options.scope!)}.
' + : ''; + return ''' +
+ ${escapeHtml(options.errorMessage!)} + $scope +
+'''; + } + if (options.hasFlash) { + final scope = options.hasScope + ? '
Target: ${escapeHtml(options.scope!)}.
' + : ''; + return ''' +
+ ${escapeHtml(options.flashMessage!)} + $scope +
+'''; + } + return ''; +} diff --git a/packages/dashboard/lib/src/ui/workflows.dart b/packages/dashboard/lib/src/ui/workflows.dart new file mode 100644 index 00000000..a147e95e --- /dev/null +++ b/packages/dashboard/lib/src/ui/workflows.dart @@ -0,0 +1,104 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildWorkflowsContent({ + required List taskStatuses, + required WorkflowsPageOptions options, +}) { + final runs = buildWorkflowRunSummaries(taskStatuses, limit: 400); + final workflowFilter = options.workflow?.toLowerCase(); + final runFilter = options.runId?.toLowerCase(); + final filtered = runs.where((entry) { + final matchesWorkflow = + workflowFilter == null || + workflowFilter.isEmpty || + entry.workflowName.toLowerCase().contains(workflowFilter); + final matchesRun = + runFilter == null || + runFilter.isEmpty || + entry.runId.toLowerCase().contains(runFilter); + return matchesWorkflow && matchesRun; + }).toList(growable: false); + + final running = filtered.fold(0, (sum, entry) => sum + entry.running); + final failed = filtered.fold(0, (sum, entry) => sum + entry.failed); + final queued = filtered.fold(0, (sum, entry) => sum + entry.queued); + + return ''' + + +
+ ${buildMetricCard('Runs (sample)', formatInt(filtered.length), 'Distinct workflow run IDs currently visible in task status history.')} + ${buildMetricCard('Queued steps', formatInt(queued), 'Queued or retried statuses across sampled runs.')} + ${buildMetricCard('Running steps', formatInt(running), 'Statuses currently executing inside workflow runs.')} + ${buildMetricCard('Failed steps', formatInt(failed), 'Failed statuses mapped to workflow runs.')} +
+ +
+ + + + + + ${(options.hasWorkflow || options.hasRunId) ? 'Clear' : ''} +
+ +
+
+

Workflow Runs

+
+ + + + + + + + + + + + + + + + + ${filtered.isEmpty ? ''' + + + +''' : filtered.map((entry) => ''' + + + + + + + + + + + + +''').join()} + +
Run IDWorkflowLast stepQueuedRunningSucceededFailedCancelledUpdatedActions
No workflow runs match the current filters.
${escapeHtml(entry.runId)}${escapeHtml(entry.workflowName)}${escapeHtml(entry.lastStep ?? '—')}${formatInt(entry.queued)}${formatInt(entry.running)}${formatInt(entry.succeeded)}${formatInt(entry.failed)}${formatInt(entry.cancelled)}${formatRelative(entry.lastUpdated)} + +
+
+'''; +} diff --git a/packages/dashboard/pubspec.yaml b/packages/dashboard/pubspec.yaml index d702f2c0..083a814e 100644 --- a/packages/dashboard/pubspec.yaml +++ b/packages/dashboard/pubspec.yaml @@ -5,6 +5,7 @@ publish_to: "none" environment: sdk: ">=3.9.2 <4.0.0" +resolution: workspace dependencies: intl: ^0.20.2 meta: ^1.18.0 @@ -27,14 +28,5 @@ dev_dependencies: dependency_overrides: analyzer: ^10.0.1 - stem: - path: ../stem - stem_cli: - path: ../stem_cli - stem_postgres: - path: ../stem_postgres - stem_redis: - path: ../stem_redis - stem_sqlite: - path: ../stem_sqlite - timezone: 0.11.0 + artisanal: ^0.2.0 + diff --git a/packages/dashboard/tailwind.config.js b/packages/dashboard/tailwind.config.js new file mode 100644 index 00000000..1a3f0880 --- /dev/null +++ b/packages/dashboard/tailwind.config.js @@ -0,0 +1,24 @@ +/** @type {import('tailwindcss').Config} */ +module.exports = { + content: [ + './lib/src/ui/**/*.dart', + './lib/src/server.dart', + ], + theme: { + extend: { + colors: { + stem: { + 950: '#0f172a', + 900: '#111827', + 800: '#1e293b', + 500: '#38bdf8', + 400: '#7dd3fc', + }, + }, + fontFamily: { + sans: ['Manrope', 'system-ui', 'sans-serif'], + }, + }, + }, + plugins: [], +}; diff --git a/packages/dashboard/test/dashboard_browser_test.dart b/packages/dashboard/test/dashboard_browser_test.dart index 9c315334..a073ba29 100644 --- a/packages/dashboard/test/dashboard_browser_test.dart +++ b/packages/dashboard/test/dashboard_browser_test.dart @@ -6,7 +6,8 @@ import 'package:server_testing/src/browser/bootstrap/driver/driver_manager.dart' as driver_manager; import 'package:server_testing/src/browser/interfaces/browser_type.dart' show BrowserLaunchOptions; -import 'package:stem/stem.dart' show DeadLetterEntry, DeadLetterReplayResult; +import 'package:stem/stem.dart' + show DeadLetterEntry, DeadLetterReplayResult, TaskState; import 'package:stem_dashboard/src/server.dart'; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; @@ -17,16 +18,23 @@ class _FakeDashboardService implements DashboardDataSource { _FakeDashboardService({ required List queues, required List workers, + List taskStatuses = const [], }) : _queues = queues, - _workers = workers; + _workers = workers, + _taskStatuses = taskStatuses; List _queues; List _workers; + List _taskStatuses; EnqueueRequest? lastEnqueue; final List controlCommands = []; String? lastReplayQueue; int? lastReplayLimit; bool? lastReplayDryRun; + String? lastReplayTaskId; + String? lastRevokeTaskId; + bool replayTaskSuccess = true; + bool revokeTaskSuccess = true; DeadLetterReplayResult replayResult = const DeadLetterReplayResult( entries: [], dryRun: false, @@ -43,6 +51,11 @@ class _FakeDashboardService implements DashboardDataSource { _workers = List.unmodifiable(values); } + List get taskStatuses => _taskStatuses; + set taskStatuses(List values) { + _taskStatuses = List.unmodifiable(values); + } + @override Future> fetchQueueSummaries() async => List.from(_queues); @@ -51,6 +64,51 @@ class _FakeDashboardService implements DashboardDataSource { Future> fetchWorkerStatuses() async => List.from(_workers); + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async { + final matches = _taskStatuses.where((entry) { + if (state != null && entry.state != state) return false; + if (queue != null && queue.isNotEmpty && entry.queue != queue) { + return false; + } + return true; + }); + return matches.skip(offset).take(limit).toList(growable: false); + } + + @override + Future fetchTaskStatus(String taskId) async { + for (final entry in _taskStatuses) { + if (entry.id == taskId) { + return entry; + } + } + return null; + } + + @override + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async { + final matches = _taskStatuses.where((entry) => entry.runId == runId); + return matches.take(limit).toList(growable: false); + } + + @override + Future fetchWorkflowRun(String runId) async => + null; + + @override + Future> fetchWorkflowSteps( + String runId, + ) async => const []; + @override Future enqueueTask(EnqueueRequest request) async { lastEnqueue = request; @@ -68,6 +126,25 @@ class _FakeDashboardService implements DashboardDataSource { return replayResult; } + @override + Future replayTaskById(String taskId, {String? queue}) async { + lastReplayTaskId = taskId; + if (queue != null && queue.isNotEmpty) { + lastReplayQueue = queue; + } + return replayTaskSuccess; + } + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async { + lastRevokeTaskId = taskId; + return revokeTaskSuccess; + } + @override Future> sendControlCommand( ControlCommandMessage command, { @@ -86,6 +163,10 @@ class _FakeDashboardService implements DashboardDataSource { lastReplayQueue = null; lastReplayLimit = null; lastReplayDryRun = null; + lastReplayTaskId = null; + lastRevokeTaskId = null; + replayTaskSuccess = true; + revokeTaskSuccess = true; replayResult = const DeadLetterReplayResult( entries: [], dryRun: false, @@ -283,4 +364,14 @@ return fetch('/queues/replay', { expect(service.lastReplayLimit, 5); expect(service.lastReplayDryRun, isFalse); }); + + _dashboardBrowserTest('search page renders saved views and query results', ( + browser, + ) async { + await browser.visit('/search?q=default&scope=all'); + await browser.waiter.waitFor('.table-card'); + await browser.assertSee('Search'); + await browser.assertSee('Saved Views'); + await browser.assertSee('Backlog hotspots'); + }); } diff --git a/packages/dashboard/test/dashboard_state_poll_test.dart b/packages/dashboard/test/dashboard_state_poll_test.dart new file mode 100644 index 00000000..6482ced1 --- /dev/null +++ b/packages/dashboard/test/dashboard_state_poll_test.dart @@ -0,0 +1,209 @@ +import 'dart:async'; +import 'dart:io'; + +import 'package:stem/stem.dart' + show DeadLetterEntry, DeadLetterReplayResult, TaskState; +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/services/stem_service.dart'; +import 'package:stem_dashboard/src/state/dashboard_state.dart'; +import 'package:stem_dashboard/src/stem/control_messages.dart'; +import 'package:test/test.dart'; + +class _FailingPollService implements DashboardDataSource { + @override + Future> fetchQueueSummaries() async { + throw StateError('queue failed'); + } + + @override + Future> fetchWorkerStatuses() async { + await Future.delayed(const Duration(milliseconds: 10)); + throw StateError('worker failed'); + } + + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async { + await Future.delayed(const Duration(milliseconds: 20)); + throw StateError('tasks failed'); + } + + @override + Future fetchTaskStatus(String taskId) async => + null; + + @override + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async => const []; + + @override + Future fetchWorkflowRun(String runId) async => + null; + + @override + Future> fetchWorkflowSteps( + String runId, + ) async => const []; + + @override + Future enqueueTask(EnqueueRequest request) async {} + + @override + Future replayDeadLetters( + String queue, { + int limit = 50, + bool dryRun = false, + }) async => + const DeadLetterReplayResult(entries: [], dryRun: false); + + @override + Future replayTaskById(String taskId, {String? queue}) async => false; + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async => false; + + @override + Future> sendControlCommand( + ControlCommandMessage command, { + Duration timeout = const Duration(seconds: 5), + }) async => const []; + + @override + Future close() async {} +} + +class _BacklogOnlyService implements DashboardDataSource { + @override + Future> fetchQueueSummaries() async => const [ + QueueSummary(queue: 'default', pending: 999, inflight: 0, deadLetters: 0), + ]; + + @override + Future> fetchWorkerStatuses() async => const []; + + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async => const []; + + @override + Future fetchTaskStatus(String taskId) async => + null; + + @override + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async => const []; + + @override + Future fetchWorkflowRun(String runId) async => + null; + + @override + Future> fetchWorkflowSteps( + String runId, + ) async => const []; + + @override + Future enqueueTask(EnqueueRequest request) async {} + + @override + Future replayDeadLetters( + String queue, { + int limit = 50, + bool dryRun = false, + }) async => + const DeadLetterReplayResult(entries: [], dryRun: false); + + @override + Future replayTaskById(String taskId, {String? queue}) async => false; + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async => false; + + @override + Future> sendControlCommand( + ControlCommandMessage command, { + Duration timeout = const Duration(seconds: 5), + }) async => const []; + + @override + Future close() async {} +} + +void main() { + test( + 'runOnce does not leak uncaught async errors when one poll call fails', + () async { + final uncaught = []; + + await runZonedGuarded( + () async { + final state = DashboardState( + service: _FailingPollService(), + pollInterval: const Duration(hours: 1), + ); + + await expectLater(state.runOnce(), throwsA(isA())); + await Future.delayed(const Duration(milliseconds: 60)); + await state.dispose(); + }, + (error, stackTrace) { + uncaught.add(error); + }, + ); + + expect(uncaught, isEmpty); + }, + ); + + test('alert webhook delivery times out and polling continues', () async { + final server = await HttpServer.bind(InternetAddress.loopbackIPv4, 0); + addTearDown(() => server.close(force: true)); + server.listen((_) { + // Intentionally keep responses open to simulate a hanging endpoint. + }); + + final state = DashboardState( + service: _BacklogOnlyService(), + pollInterval: const Duration(hours: 1), + alertWebhookUrls: ['http://127.0.0.1:${server.port}/alerts'], + alertBacklogThreshold: 1, + ); + + final watch = Stopwatch()..start(); + await state.runOnce(); + watch.stop(); + + expect(watch.elapsed, lessThan(const Duration(seconds: 7))); + expect( + state.auditEntries.any( + (entry) => + entry.kind == 'alert' && + entry.status == 'error' && + (entry.summary?.contains('timed out') ?? false), + ), + isTrue, + ); + + await state.dispose(); + }); +} diff --git a/packages/dashboard/test/dashboard_state_property_test.dart b/packages/dashboard/test/dashboard_state_property_test.dart index e537d2ae..405079c0 100644 --- a/packages/dashboard/test/dashboard_state_property_test.dart +++ b/packages/dashboard/test/dashboard_state_property_test.dart @@ -1,5 +1,6 @@ import 'package:property_testing/property_testing.dart'; -import 'package:stem/stem.dart' show DeadLetterEntry, DeadLetterReplayResult; +import 'package:stem/stem.dart' + show DeadLetterEntry, DeadLetterReplayResult, TaskState; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; import 'package:stem_dashboard/src/state/dashboard_state.dart'; @@ -76,6 +77,33 @@ class _SequenceDashboardService implements DashboardDataSource { return _workerSnapshots[_workerIndex++]; } + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async => const []; + + @override + Future fetchTaskStatus(String taskId) async => + null; + + @override + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async => const []; + + @override + Future fetchWorkflowRun(String runId) async => + null; + + @override + Future> fetchWorkflowSteps( + String runId, + ) async => const []; + @override Future enqueueTask(EnqueueRequest request) async {} @@ -87,6 +115,16 @@ class _SequenceDashboardService implements DashboardDataSource { }) async => const DeadLetterReplayResult(entries: [], dryRun: false); + @override + Future replayTaskById(String taskId, {String? queue}) async => false; + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async => false; + @override Future> sendControlCommand( ControlCommandMessage command, { diff --git a/packages/dashboard/test/server_test.dart b/packages/dashboard/test/server_test.dart index 165126c2..a0f13451 100644 --- a/packages/dashboard/test/server_test.dart +++ b/packages/dashboard/test/server_test.dart @@ -1,7 +1,7 @@ import 'package:routed_testing/routed_testing.dart'; import 'package:server_testing/server_testing.dart'; import 'package:stem/stem.dart' - show DeadLetterEntry, DeadLetterReplayResult, Envelope; + show DeadLetterEntry, DeadLetterReplayResult, Envelope, TaskState; import 'package:stem_dashboard/src/server.dart'; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; @@ -9,10 +9,15 @@ import 'package:stem_dashboard/src/state/dashboard_state.dart'; import 'package:stem_dashboard/src/stem/control_messages.dart'; class _RecordingService implements DashboardDataSource { - _RecordingService({this.queues = const [], this.workers = const []}); + _RecordingService({ + this.queues = const [], + this.workers = const [], + this.taskStatuses = const [], + }); final List queues; final List workers; + final List taskStatuses; EnqueueRequest? lastEnqueue; final List controlCommands = []; @@ -20,6 +25,12 @@ class _RecordingService implements DashboardDataSource { String? lastReplayQueue; int? lastReplayLimit; bool? lastReplayDryRun; + String? lastReplayTaskId; + String? lastRevokeTaskId; + bool? lastRevokeTerminate; + String? lastRevokeReason; + bool replayTaskSuccess = true; + bool revokeTaskSuccess = true; DeadLetterReplayResult replayResult = const DeadLetterReplayResult( entries: [], dryRun: false, @@ -31,6 +42,61 @@ class _RecordingService implements DashboardDataSource { @override Future> fetchWorkerStatuses() async => workers; + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async { + final filtered = taskStatuses + .where((entry) { + if (state != null && entry.state != state) { + return false; + } + if (queue != null && + queue.trim().isNotEmpty && + entry.queue != queue) { + return false; + } + return true; + }) + .skip(offset) + .take(limit); + return filtered.toList(growable: false); + } + + @override + Future fetchTaskStatus(String taskId) async { + for (final entry in taskStatuses) { + if (entry.id == taskId) { + return entry; + } + } + return null; + } + + @override + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async { + final filtered = taskStatuses + .where((entry) => entry.runId == runId) + .take(limit) + .toList(growable: false); + return filtered; + } + + @override + Future fetchWorkflowRun(String runId) async => + null; + + @override + Future> fetchWorkflowSteps( + String runId, + ) async => const []; + @override Future enqueueTask(EnqueueRequest request) async { lastEnqueue = request; @@ -48,6 +114,27 @@ class _RecordingService implements DashboardDataSource { return replayResult; } + @override + Future replayTaskById(String taskId, {String? queue}) async { + lastReplayTaskId = taskId; + if (queue != null && queue.isNotEmpty) { + lastReplayQueue = queue; + } + return replayTaskSuccess; + } + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async { + lastRevokeTaskId = taskId; + lastRevokeTerminate = terminate; + lastRevokeReason = reason; + return revokeTaskSuccess; + } + @override Future> sendControlCommand( ControlCommandMessage command, { @@ -63,10 +150,15 @@ class _RecordingService implements DashboardDataSource { Future _buildClient( _RecordingService service, - DashboardState state, -) async { + DashboardState state, { + String basePath = '', +}) async { await state.runOnce(); - final engine = buildDashboardEngine(service: service, state: state); + final engine = buildDashboardEngine( + service: service, + state: state, + basePath: basePath, + ); final handler = RoutedRequestHandler(engine, true); addTearDown(() async { await handler.close(); @@ -100,6 +192,58 @@ void main() { ..assertBodyContains('critical'); }); + test('GET /partials/overview renders turbo stream section updates', () async { + final service = _RecordingService( + queues: const [ + QueueSummary(queue: 'default', pending: 2, inflight: 1, deadLetters: 0), + ], + workers: [ + WorkerStatus( + workerId: 'worker-1', + namespace: 'stem', + timestamp: DateTime.utc(2026), + inflight: 1, + isolateCount: 2, + queues: const [], + ), + ], + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-1', + state: TaskState.running, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'default', + taskName: 'demo.run', + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get( + '/partials/overview', + headers: { + 'accept': ['text/vnd.turbo-stream.html'], + }, + ); + response + ..assertStatus(200) + ..assertBodyContains( + '.generate(55, (index) { + final position = index + 1; + return DashboardTaskStatusEntry( + id: 'task-$position', + state: TaskState.succeeded, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, position), + queue: 'alpha', + taskName: 'demo.task.$position', + ); + }); + final service = _RecordingService( + queues: const [ + QueueSummary(queue: 'alpha', pending: 0, inflight: 0, deadLetters: 0), + ], + taskStatuses: statuses, + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get('/tasks?page=2&pageSize=25'); + response + ..assertStatus(200) + ..assertBodyContains('Page 2') + ..assertBodyContains('task-26') + ..assertBodyContains('Previous') + ..assertBodyContains('Next'); + }, + ); + + test('GET /tasks applies namespace/task/run filters', () async { + final service = _RecordingService( + queues: const [ + QueueSummary(queue: 'alpha', pending: 0, inflight: 0, deadLetters: 0), + ], + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-a', + state: TaskState.running, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'alpha', + taskName: 'greeting.send', + runId: 'run-1', + meta: const {'namespace': 'stem'}, + ), + DashboardTaskStatusEntry( + id: 'task-b', + state: TaskState.running, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 2), + queue: 'alpha', + taskName: 'greeting.send', + runId: 'run-2', + meta: const {'namespace': 'tenant-a'}, + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get( + '/tasks?namespace=tenant-a&task=greeting&runId=run-2', + ); + response + ..assertStatus(200) + ..assertBodyContains('task-b'); + expect(response.body, isNot(contains('task-a'))); + }); + + test('GET /namespaces renders namespace rollup table', () async { + final service = _RecordingService( + queues: const [ + QueueSummary(queue: 'alpha', pending: 2, inflight: 1, deadLetters: 0), + ], + workers: [ + WorkerStatus( + workerId: 'worker-1', + namespace: 'stem', + timestamp: DateTime.utc(2026), + isolateCount: 2, + inflight: 1, + queues: const [WorkerQueueInfo(name: 'alpha', inflight: 1)], + ), + ], + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-ns-1', + state: TaskState.running, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'alpha', + taskName: 'demo.task', + meta: const {'namespace': 'stem'}, + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get('/namespaces'); + response + ..assertStatus(200) + ..assertBodyContains('Namespaces') + ..assertBodyContains('Namespace Summary') + ..assertBodyContains('stem'); + }); + + test('GET /workflows renders workflow run summaries', () async { + final service = _RecordingService( + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-wf-1', + state: TaskState.running, + attempt: 0, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'alpha', + taskName: 'workflow.step', + runId: 'run-xyz', + workflowName: 'greetingFlow', + workflowStep: 'stepA', + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get('/workflows'); + response + ..assertStatus(200) + ..assertBodyContains('Workflow Runs') + ..assertBodyContains('run-xyz') + ..assertBodyContains('greetingFlow'); + }); + + test('GET /jobs renders job family summary', () async { + final service = _RecordingService( + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-job-1', + state: TaskState.failed, + attempt: 1, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'alpha', + taskName: 'greeting.send', + errorMessage: 'boom', + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get('/jobs'); + response + ..assertStatus(200) + ..assertBodyContains('Job Summary') + ..assertBodyContains('greeting.send'); + }); + + test('GET /tasks/inline renders lazy task panel as turbo stream', () async { + final service = _RecordingService( + taskStatuses: [ + DashboardTaskStatusEntry( + id: 'task-inline-1', + state: TaskState.failed, + attempt: 1, + createdAt: DateTime.utc(2026), + updatedAt: DateTime.utc(2026, 1, 1, 0, 1), + queue: 'alpha', + taskName: 'demo.inline', + errorMessage: 'boom', + meta: const {'stem.task': 'demo.inline'}, + ), + ], + ); + final state = DashboardState(service: service); + final client = await _buildClient(service, state); + + final response = await client.get( + '/tasks/inline?id=task-inline-1&target=task-inline-task-inline-1', + headers: { + 'accept': ['text/vnd.turbo-stream.html'], + }, + ); + response + ..assertStatus(200) + ..assertBodyContains(' entry.id == 'task-failed'); + expect(failed.queue, 'critical'); + expect(failed.taskName, 'demo.fail'); + expect(failed.state, TaskState.failed); + expect(failed.errorMessage, 'boom'); + + final stemMeta = all.firstWhere((entry) => entry.id == 'task-stem-meta'); + expect(stemMeta.queue, 'stem-only'); + expect(stemMeta.taskName, 'demo.stem.meta'); + expect(stemMeta.state, TaskState.running); + + final failedOnly = await service.fetchTaskStatuses(state: TaskState.failed); + expect(failedOnly, hasLength(1)); + expect(failedOnly.first.id, 'task-failed'); + + final queueOnly = await service.fetchTaskStatuses(queue: 'default'); + expect(queueOnly, hasLength(1)); + expect(queueOnly.first.id, 'task-ok'); + + final detail = await service.fetchTaskStatus('task-failed'); + expect(detail, isNotNull); + expect(detail!.errorType, 'StateError'); + expect(detail.errorMessage, 'boom'); + expect(detail.runId, 'run-1'); + + final runStatuses = await service.fetchTaskStatusesForRun('run-1'); + expect(runStatuses.length, 3); + }); } diff --git a/packages/dashboard/test/ui_escape_test.dart b/packages/dashboard/test/ui_escape_test.dart new file mode 100644 index 00000000..edc3c22f --- /dev/null +++ b/packages/dashboard/test/ui_escape_test.dart @@ -0,0 +1,74 @@ +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/event_templates.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; +import 'package:stem_dashboard/src/ui/workers.dart'; +import 'package:test/test.dart'; + +void main() { + test('buildWorkerRow escapes worker and queue values', () { + final html = buildWorkerRow( + WorkerStatus( + workerId: 'worker', + namespace: 'stem', + timestamp: DateTime.utc(2026), + isolateCount: 2, + inflight: 1, + queues: const [ + WorkerQueueInfo(name: 'queue" onclick="evil()', inflight: 1), + ], + ), + ); + + expect(html, contains('worker<script>alert(1)</script>')); + expect(html, contains('queue" onclick="evil()')); + expect(html, isNot(contains('worker'))); + }); + + test('buildWorkersContent escapes namespace filter in empty state', () { + final html = buildWorkersContent( + const [], + const [], + const WorkersPageOptions(namespaceFilter: ''), + ); + + expect(html, contains('<svg/onload=alert(1)>')); + expect(html, isNot(contains(''))); + }); + + test('buildQueueTableRow escapes queue in content and attributes', () { + final html = buildQueueTableRow( + const QueueSummary( + queue: 'alpha" data-pwn="1', + pending: 1, + inflight: 0, + deadLetters: 0, + ), + ); + + expect(html, contains('data-queue-row="alpha" data-pwn="1"')); + expect( + html, + contains('alpha" data-pwn="1'), + ); + expect(html, isNot(contains('data-queue-row="alpha" data-pwn="1"'))); + }); + + test('renderEventItem escapes title, summary, and metadata values', () { + final html = renderEventItem( + DashboardEvent( + title: 'event', + timestamp: DateTime.utc(2026), + summary: '', + metadata: const { + 'queue': '', + }, + ), + ); + + expect(html, contains('<b>event</b>')); + expect(html, contains('<script>alert(1)</script>')); + expect(html, contains('<img src=x onerror=alert(1)>')); + expect(html, isNot(contains(''))); + }); +} diff --git a/packages/dashboard/test/ui_paths_test.dart b/packages/dashboard/test/ui_paths_test.dart new file mode 100644 index 00000000..5fde2eaf --- /dev/null +++ b/packages/dashboard/test/ui_paths_test.dart @@ -0,0 +1,31 @@ +import 'package:stem_dashboard/src/ui/paths.dart'; +import 'package:test/test.dart'; + +void main() { + test( + 'prefixes root-relative url attributes while preserving quote style', + () { + const html = + 'Tasks ' + "
" + ''; + + final prefixed = prefixDashboardUrlAttributes(html, '/dashboard'); + + expect(prefixed, contains('href="/dashboard/tasks"')); + expect(prefixed, contains("action='/dashboard/workers'")); + expect(prefixed, contains('value="/dashboard/search?q=alpha"')); + }, + ); + + test('does not rewrite protocol-relative urls', () { + const html = + 'CDN ' + '
'; + + final prefixed = prefixDashboardUrlAttributes(html, '/dashboard'); + + expect(prefixed, contains('href="//cdn.example.com/app.js"')); + expect(prefixed, contains('action="/dashboard/workers"')); + }); +} diff --git a/packages/dashboard/web/tailwind.input.css b/packages/dashboard/web/tailwind.input.css new file mode 100644 index 00000000..391add3b --- /dev/null +++ b/packages/dashboard/web/tailwind.input.css @@ -0,0 +1,450 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + + @layer base { + *, + *::before, + *::after { + @apply box-border; + } + + html { + color-scheme: dark; + } + + body { + @apply m-0 bg-stem-950 font-sans text-slate-200 antialiased; + } + + a { + @apply text-inherit no-underline; + } + + h1 { + @apply m-0 text-3xl font-semibold tracking-tight; + } + + table { + @apply w-full border-collapse; + } + + thead { + @apply bg-slate-800/85 text-xs uppercase tracking-widest text-slate-400; + } + + th, + td { + @apply px-4 py-3.5 text-left align-top; + } + + tbody tr { + @apply border-b border-slate-400/10 transition-colors duration-150; + } + + tbody tr:last-child { + @apply border-b-0; + } + + tbody tr:hover { + @apply bg-blue-900/25; + } + + turbo-frame#dashboard-content { + @apply block flex-1; + } + } + + @layer components { + .app-shell { + @apply relative flex min-h-screen gap-4 p-3 lg:gap-6 lg:p-6; + } + + .app-shell::before { + content: ''; + @apply pointer-events-none fixed inset-0; + background: + radial-gradient(900px 420px at 12% -8%, rgba(56, 189, 248, 0.18), transparent 70%), + radial-gradient(760px 380px at 102% 18%, rgba(14, 116, 144, 0.16), transparent 64%), + linear-gradient(180deg, #0b1220 0%, #0f172a 48%, #0c1324 100%); + } + + .app-shell > * { + @apply relative z-10; + } + + .sidebar-backdrop { + @apply pointer-events-none fixed inset-0 z-30 bg-stem-950/70 opacity-0 backdrop-blur-sm transition duration-200 lg:hidden; + } + + .sidebar-backdrop[data-open='true'] { + @apply pointer-events-auto opacity-100; + } + + .sidebar { + @apply fixed inset-y-3 left-3 z-40 flex -translate-x-full flex-col gap-6 rounded-3xl border border-sky-200/20 bg-gradient-to-b from-slate-900/95 via-blue-950/90 to-cyan-900/80 px-4 py-5 shadow-2xl shadow-sky-950/40 transition duration-300 lg:sticky lg:top-6 lg:w-72 lg:translate-x-0; + width: min(18rem, calc(100vw - 1.5rem)); + height: calc(100vh - 1.5rem); + } + + .sidebar[data-open='true'] { + @apply translate-x-0; + } + + .sidebar-head { + @apply flex items-center justify-between gap-3; + } + + .sidebar-close { + @apply inline-flex h-10 w-10 items-center justify-center rounded-xl border border-slate-300/20 bg-slate-900/60 text-slate-300 transition hover:border-sky-300/45 hover:text-sky-200 lg:hidden; + } + + .brand-panel { + @apply rounded-2xl border border-sky-300/25 bg-sky-400/10 px-3.5 py-3; + } + + .brand { + @apply text-sm font-semibold uppercase tracking-widest text-sky-200; + } + + .brand-tagline { + @apply mt-1 text-xs text-slate-400; + } + + .sidebar-status { + @apply mt-4 flex items-center gap-2 text-xs uppercase tracking-wider text-emerald-200; + } + + .status-dot { + @apply h-2.5 w-2.5 rounded-full bg-emerald-300 ring-4 ring-emerald-300/20; + } + + nav { + @apply flex flex-col gap-2 rounded-2xl border border-slate-300/15 bg-slate-900/40 p-2; + } + + .nav-link { + @apply flex items-center gap-3 rounded-xl px-3.5 py-2.5 font-medium text-slate-300/80 transition duration-150; + } + + .nav-link::before { + content: ''; + @apply h-2 w-2 rounded-full bg-slate-400/30 transition duration-150; + } + + .nav-link:hover { + @apply translate-x-0.5 bg-sky-300/10 text-slate-100; + } + + .nav-link.active { + @apply bg-gradient-to-r from-sky-300/25 to-cyan-300/20 text-slate-50; + } + + .nav-link.active::before { + @apply bg-sky-300; + } + + .sidebar-footer { + @apply mt-auto rounded-2xl border border-slate-300/15 bg-slate-900/45 px-3.5 py-3 text-xs leading-relaxed text-slate-300/70; + } + + .main { + @apply relative flex flex-1 flex-col rounded-3xl border border-slate-400/15 bg-slate-900/70 p-4 shadow-2xl backdrop-blur xl:p-6; + } + + .top-panel { + @apply mb-5 flex flex-wrap items-center justify-between gap-4 rounded-2xl border border-sky-300/20 bg-gradient-to-r from-slate-900/85 via-slate-900/70 to-cyan-950/60 px-4 py-4; + } + + .top-panel-left { + @apply flex min-w-0 items-center gap-3; + } + + .sidebar-toggle { + @apply inline-flex h-10 w-10 items-center justify-center rounded-xl border border-slate-300/20 bg-slate-900/60 text-slate-200 transition hover:border-sky-300/45 hover:text-sky-200 lg:hidden; + } + + .panel-eyebrow { + @apply text-xs uppercase tracking-wider text-slate-400; + } + + .panel-title { + @apply truncate text-xl font-semibold tracking-tight text-slate-100 sm:text-2xl; + } + + .top-panel-right { + @apply flex flex-wrap items-center gap-2 sm:gap-3; + } + + .status-pill { + @apply inline-flex items-center gap-2 rounded-full border border-emerald-300/35 bg-emerald-300/10 px-3 py-1 text-xs font-semibold uppercase tracking-wider text-emerald-100; + } + + .status-pill-dot { + @apply h-2 w-2 rounded-full bg-emerald-300; + } + + .quick-link { + @apply inline-flex items-center rounded-full border border-sky-300/30 bg-sky-400/12 px-3 py-1.5 text-xs font-semibold uppercase tracking-wide text-sky-200 transition hover:border-sky-200/50 hover:bg-sky-300/20; + } + + .content-shell { + @apply flex min-h-0 flex-1; + } + + .page-header { + @apply mb-7; + } + + .page-subtitle { + @apply mt-3 text-sm text-slate-400; + } + + .cards { + @apply mb-8 grid grid-cols-1 gap-5 sm:grid-cols-2 xl:grid-cols-3 2xl:grid-cols-4; + } + + .card { + @apply rounded-2xl border border-slate-400/15 bg-gradient-to-br from-stem-900/95 to-stem-950/80 p-5 shadow-xl; + } + + .card-title { + @apply mb-3 text-xs font-semibold uppercase tracking-wide text-slate-400; + } + + .card-value { + @apply text-3xl font-semibold; + } + + .card-caption { + @apply mt-2.5 text-sm text-slate-400/90; + } + + .table-card { + @apply overflow-hidden rounded-2xl border border-slate-400/15 bg-stem-950/85; + } + + .filter-form { + @apply my-6 flex flex-wrap items-center gap-3; + } + + .filter-form input[type="text"], + .filter-form select { + @apply min-w-40 rounded-xl border border-slate-400/20 bg-slate-800/75 px-3.5 py-2.5 text-slate-100; + } + + .filter-form input[type="text"] { + @apply min-w-52; + } + + .filter-form button, + .enqueue-form button { + @apply cursor-pointer rounded-xl bg-sky-400 px-4 py-2.5 font-semibold text-stem-950 transition duration-150 hover:bg-sky-300; + } + + .filter-label { + @apply text-xs uppercase tracking-widest text-slate-400; + } + + .clear-filter { + @apply text-sm text-sky-300; + } + + .sort-link { + @apply font-semibold text-slate-400; + } + + .sort-link:hover { + @apply text-sky-300; + } + + .sort-link.active { + @apply text-slate-100; + } + + .queue-row, + .task-row { + @apply cursor-pointer; + } + + .queue-row:hover { + @apply bg-sky-400/10; + } + + .queue-detail, + .task-detail { + @apply hidden bg-stem-950/80; + } + + .queue-detail.visible, + .task-detail.visible { + @apply table-row; + } + + .task-detail-cell { + @apply p-4; + } + + .detail-grid { + @apply grid grid-cols-1 gap-3 md:grid-cols-2 xl:grid-cols-3; + } + + .detail-grid div { + @apply rounded-xl border border-slate-400/15 bg-slate-800/60 p-3; + } + + .meta-list { + @apply grid grid-cols-1 gap-2 lg:grid-cols-2; + } + + .meta-item { + @apply flex min-w-0 flex-col gap-1 rounded-lg border border-slate-400/15 bg-slate-800/60 p-2.5; + } + + .payload-block { + @apply mt-2.5 whitespace-pre-wrap break-words rounded-lg border border-slate-400/20 bg-slate-950/70 px-3 py-2.5 font-mono text-xs leading-relaxed text-slate-300; + } + + .flash { + @apply mb-5 rounded-xl px-4 py-3.5 font-semibold; + } + + .flash.success { + @apply border border-emerald-400/35 bg-emerald-400/15 text-emerald-200; + } + + .flash.error { + @apply border border-red-400/35 bg-red-400/15 text-red-200; + } + + .enqueue-card { + @apply rounded-2xl border border-slate-400/15 bg-stem-950/85 p-6; + } + + .enqueue-form { + @apply flex flex-col gap-4; + } + + .form-grid { + @apply grid grid-cols-1 gap-4 md:grid-cols-2; + } + + .form-grid label { + @apply flex flex-col gap-2 text-sm text-slate-400; + } + + .form-grid input, + .form-grid textarea { + @apply rounded-xl border border-slate-400/20 bg-slate-800/75 px-3 py-2.5 text-slate-100; + } + + .payload-label textarea { + @apply min-h-32 resize-y; + } + + .enqueue-form button { + @apply self-start px-4 py-3; + } + + .muted { + @apply text-slate-400; + } + + .pill { + @apply inline-flex items-center gap-1.5 rounded-full bg-sky-400/15 px-3 py-1.5 text-xs text-sky-300; + } + + .pill.success { + @apply bg-emerald-400/15 text-emerald-200; + } + + .pill.error { + @apply bg-red-400/15 text-red-200; + } + + .pill.warning { + @apply bg-amber-400/20 text-amber-200; + } + + .pill.running { + @apply bg-blue-400/20 text-blue-200; + } + + .pill.muted { + @apply bg-slate-400/20 text-slate-300; + } + + .event-feed { + @apply grid gap-4; + } + + .control-panel { + @apply mt-7 rounded-2xl border border-slate-400/15 bg-stem-950/80 p-6; + } + + .section-heading { + @apply mb-4 text-lg font-semibold tracking-tight; + } + + .action-bar { + @apply flex flex-wrap gap-3; + } + + .inline-form { + @apply m-0 inline-flex items-center; + } + + .ghost-button { + @apply cursor-pointer rounded-xl border border-sky-300/30 bg-sky-400/15 px-3.5 py-2 font-semibold text-sky-300 transition duration-150 hover:bg-sky-400/25; + } + + .ghost-button.disabled { + @apply pointer-events-none cursor-default opacity-45; + } + + .pager { + @apply flex items-center justify-between gap-3 border-b border-slate-400/10 px-4 py-3.5; + } + + .event-item { + @apply overflow-hidden rounded-2xl border border-slate-400/15 bg-stem-950/85; + } + + .event-item summary { + @apply flex cursor-pointer list-none items-center justify-between px-4 py-4 font-semibold text-slate-100; + } + + .event-item summary::-webkit-details-marker { + display: none; + } + + .event-item[open] summary { + @apply bg-sky-400/10; + } + + .event-title { + @apply text-base; + } + + .event-time { + @apply text-sm text-slate-400; + } + + .event-item > *:not(summary) { + @apply px-4 pb-4; + } + + .event-meta { + @apply mt-2.5 flex flex-wrap gap-3 text-sm text-slate-400; + } + + .error-preview { + @apply inline-block max-w-[30rem] overflow-hidden text-ellipsis whitespace-nowrap align-top; + } + } + + @media (min-width: 1024px) { + .sidebar { + width: 18rem; + } + } diff --git a/packages/stem/CHANGELOG.md b/packages/stem/CHANGELOG.md index f3e05269..f9620a28 100644 --- a/packages/stem/CHANGELOG.md +++ b/packages/stem/CHANGELOG.md @@ -2,6 +2,18 @@ ## 0.1.1 +- Expanded span attribution across enqueue/consume/execute with task identity, + queue, worker, host, lineage, namespace, and workflow step metadata + (`run_id`, `step`, `step_id`, `step_index`, `step_attempt`, `iteration`). +- Improved worker retry republish behavior to preserve optional payload signing + when retrying deliveries. +- Added workflow metadata quality-of-life getters and watcher/run-state helpers + to make workflow introspection easier from task metadata. +- Strengthened tracing and workflow-related test coverage for metadata + propagation and contract behavior. +- Expanded the microservice example with richer workload generation, queue + diversity, updated scheduler/demo flows, and full local observability wiring + for Jaeger/Prometheus/Grafana through nginx. - Improved bootstrap DX with explicit fail-fast errors across broker/backend/ workflow/schedule/lock/revoke resolution paths in `StemStack.fromUrl`, including actionable hints when adapters support a URL but do not implement diff --git a/packages/stem/example/microservice/README.md b/packages/stem/example/microservice/README.md index 73b104f1..9dde5fc1 100644 --- a/packages/stem/example/microservice/README.md +++ b/packages/stem/example/microservice/README.md @@ -22,6 +22,10 @@ All services expect the following environment variables (see `.env.example` for | `STEM_TLS_CLIENT_CERT` | _(optional)_ | mTLS client certificate used by enqueuers/workers. | | `STEM_TLS_CLIENT_KEY` | _(optional)_ | Private key associated with the client certificate. | | `PORT` | `8081` | HTTP port for the enqueue API (nginx fronts it on `api.localhost:8080`). | +| `ENQUEUER_AUTOFILL_ENABLED` | `true` | Enables a background demo producer that keeps the dashboard populated. | +| `ENQUEUER_AUTOFILL_INTERVAL_MS` | `2500` | Interval between auto-fill publish cycles. | +| `ENQUEUER_AUTOFILL_BATCH_SIZE` | `2` | Number of successful tasks published per auto-fill cycle. | +| `ENQUEUER_AUTOFILL_FAILURE_EVERY` | `8` | Every Nth cycle enqueues one synthetic failing task. | | `STEM_SCHEDULE_FILE` | `/config/schedules.yaml` | Optional YAML file the beat service uses to seed schedules. | | `STEM_METRIC_EXPORTERS` | `otlp:http://otel-collector:4318/v1/metrics` | Comma-separated list of metrics exporters enabled for workers (OTLP by default). | | `STEM_OTLP_ENDPOINT` | `http://otel-collector:4318/v1/traces` | Default OTLP endpoint used when exporters do not specify a destination. | @@ -58,7 +62,7 @@ If you want to run the stack with TLS enabled, generate certificates and switch ## Running with Docker Compose ```bash -cd examples/microservice +cd packages/stem/example/microservice cp .env.hmac_tls .env # or .env.hmac / .env.ed25519_tls docker compose up --build ``` @@ -74,14 +78,20 @@ The stack now brings up Redis, the enqueue API, three workers, the beat schedule - **Local overrides:** The compose file expects a routed ecosystem checkout next to this repo (sibling directory at `../routed_ecosystem`) so the dashboard overrides resolve correctly. If your local path differs, update the `volumes` entries in `docker-compose.yml` accordingly. -Workers emit metrics to the collector via OTLP; Prometheus scrapes the collector and Grafana ships with a pre-provisioned datasource so you can build dashboards immediately. Jaeger receives spans published through the collector, allowing you to trace enqueue and worker execution paths without extra configuration. +Workers emit metrics to the collector via OTLP; Prometheus scrapes the collector and Grafana ships with pre-provisioned datasources and Stem dashboards: + +- `Stem Overview` +- `Stem Workers & Queues` +- `Stem Scheduler` + +Jaeger receives spans published through the collector, allowing you to trace enqueue and worker execution paths without extra configuration. Enqueue a task: ```bash curl -X POST http://api.localhost:8080/enqueue \ -H 'content-type: application/json' \ - -d '{"name": "Ada"}' + -d '{"name": "Ada", "task": "greeting.send"}' ``` Fan out work with the canvas helper: @@ -106,6 +116,15 @@ stem schedule list stem schedule dry-run --id greetings-reminder --count 3 ``` +The demo stack now auto-generates background traffic across `greetings`, +`billing`, and `reporting` queues, including synthetic workflow metadata +(`stem.workflow.runId`, `stem.workflow.name`, `stem.workflow.step`) so the +Workflows/Jobs/Namespaces dashboard views stay populated. Disable this with: + +```bash +ENQUEUER_AUTOFILL_ENABLED=false docker compose up --build +``` + Stop the stack with `docker compose down`. ## Running locally with Dart @@ -130,7 +149,7 @@ Stop the stack with `docker compose down`. 3. Run the worker: ```bash - cd examples/microservice/worker + cd packages/stem/example/microservice/worker dart pub get dart run bin/worker.dart ``` @@ -138,7 +157,7 @@ Stop the stack with `docker compose down`. 4. In another terminal, run the enqueue API: ```bash - cd examples/microservice/enqueuer + cd packages/stem/example/microservice/enqueuer dart pub get dart run bin/main.dart ``` @@ -146,7 +165,7 @@ Stop the stack with `docker compose down`. The worker logs progress for each greeting task, demonstrating isolate execution, heartbeats, and result backend updates. Start the beat service in a third terminal to dispatch scheduled jobs: ```bash -cd examples/microservice/beat +cd packages/stem/example/microservice/beat dart pub get dart run bin/beat.dart ``` diff --git a/packages/stem/example/microservice/beat/Dockerfile b/packages/stem/example/microservice/beat/Dockerfile index ad1a206f..ef4cd4af 100644 --- a/packages/stem/example/microservice/beat/Dockerfile +++ b/packages/stem/example/microservice/beat/Dockerfile @@ -3,6 +3,7 @@ FROM dart:stable WORKDIR /workspace COPY . /workspace -WORKDIR /workspace/example/microservice/beat +WORKDIR /workspace/packages/stem/example/microservice/beat ENV DART_PUB_CACHE=/tmp/.dart_pub_cache +ENV PATH=/usr/lib/dart/bin:$PATH CMD ["sh", "-c", "dart pub get && dart run bin/beat.dart"] diff --git a/packages/stem/example/microservice/beat/bin/beat.dart b/packages/stem/example/microservice/beat/bin/beat.dart index eabb16b6..8a8195a0 100644 --- a/packages/stem/example/microservice/beat/bin/beat.dart +++ b/packages/stem/example/microservice/beat/bin/beat.dart @@ -11,7 +11,10 @@ const _deepEquals = DeepCollectionEquality(); Future main(List args) async { // #region signing-beat-config final config = StemConfig.fromEnvironment(); + final observability = ObservabilityConfig.fromEnvironment(); // #endregion signing-beat-config + observability.applyMetricExporters(); + observability.applySignalConfiguration(); final broker = await RedisStreamsBroker.connect( config.brokerUrl, diff --git a/packages/stem/example/microservice/beat/pubspec.yaml b/packages/stem/example/microservice/beat/pubspec.yaml index ec8769c7..c842d7eb 100644 --- a/packages/stem/example/microservice/beat/pubspec.yaml +++ b/packages/stem/example/microservice/beat/pubspec.yaml @@ -18,3 +18,5 @@ dev_dependencies: dependency_overrides: stem: path: ../../.. + stem_memory: + path: ../../../../stem_memory diff --git a/packages/stem/example/microservice/dashboard/Dockerfile b/packages/stem/example/microservice/dashboard/Dockerfile index 400543ce..70032857 100644 --- a/packages/stem/example/microservice/dashboard/Dockerfile +++ b/packages/stem/example/microservice/dashboard/Dockerfile @@ -3,6 +3,7 @@ FROM dart:stable WORKDIR /workspace COPY . /workspace -WORKDIR /workspace/dashboard +WORKDIR /workspace/packages/dashboard ENV DART_PUB_CACHE=/tmp/.dart_pub_cache +ENV PATH=/usr/lib/dart/bin:$PATH CMD ["sh", "-c", "dart pub get && dart run bin/dashboard.dart"] diff --git a/packages/stem/example/microservice/docker-compose.yml b/packages/stem/example/microservice/docker-compose.yml index 9ccfa769..4ba0758e 100644 --- a/packages/stem/example/microservice/docker-compose.yml +++ b/packages/stem/example/microservice/docker-compose.yml @@ -16,6 +16,7 @@ services: image: jaegertracing/all-in-one:1.56 environment: COLLECTOR_OTLP_ENABLED: "true" + QUERY_BASE_PATH: /jaeger restart: unless-stopped prometheus: @@ -36,12 +37,13 @@ services: environment: GF_SECURITY_ADMIN_USER: admin GF_SECURITY_ADMIN_PASSWORD: admin - GF_SERVER_DOMAIN: localhost - GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s:%(http_port)s/grafana/" + GF_SERVER_ROOT_URL: "http://localhost:8080/grafana/" GF_SERVER_SERVE_FROM_SUB_PATH: "true" volumes: - grafana-data:/var/lib/grafana - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yaml:ro + - ./grafana/provisioning/dashboards/stem.yml:/etc/grafana/provisioning/dashboards/stem.yaml:ro + - ./grafana/dashboards:/etc/grafana/dashboards/stem:ro depends_on: - prometheus restart: unless-stopped @@ -61,8 +63,8 @@ services: dashboard: build: - context: ../.. - dockerfile: example/microservice/dashboard/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/dashboard/Dockerfile env_file: - .env.example environment: @@ -74,12 +76,16 @@ services: enqueuer: build: - context: ../.. - dockerfile: example/microservice/enqueuer/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/enqueuer/Dockerfile env_file: - .env.example environment: PORT: ${PORT:-8081} + ENQUEUER_AUTOFILL_ENABLED: ${ENQUEUER_AUTOFILL_ENABLED:-true} + ENQUEUER_AUTOFILL_INTERVAL_MS: ${ENQUEUER_AUTOFILL_INTERVAL_MS:-2500} + ENQUEUER_AUTOFILL_BATCH_SIZE: ${ENQUEUER_AUTOFILL_BATCH_SIZE:-2} + ENQUEUER_AUTOFILL_FAILURE_EVERY: ${ENQUEUER_AUTOFILL_FAILURE_EVERY:-8} depends_on: - redis - otel-collector @@ -87,10 +93,14 @@ services: worker: build: - context: ../.. - dockerfile: example/microservice/worker/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/worker/Dockerfile env_file: - .env.example + environment: + STEM_WORKER_NAME: microservice-worker-1 + STEM_WORKER_QUEUE: greetings + STEM_WORKER_NAMESPACE: customer-experience depends_on: - redis - otel-collector @@ -98,10 +108,14 @@ services: worker2: build: - context: ../.. - dockerfile: example/microservice/worker/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/worker/Dockerfile env_file: - .env.example + environment: + STEM_WORKER_NAME: microservice-worker-2 + STEM_WORKER_QUEUE: billing + STEM_WORKER_NAMESPACE: revenue depends_on: - redis - otel-collector @@ -109,10 +123,14 @@ services: worker3: build: - context: ../.. - dockerfile: example/microservice/worker/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/worker/Dockerfile env_file: - .env.example + environment: + STEM_WORKER_NAME: microservice-worker-3 + STEM_WORKER_QUEUE: reporting + STEM_WORKER_NAMESPACE: analytics depends_on: - redis - otel-collector @@ -120,8 +138,8 @@ services: beat: build: - context: ../.. - dockerfile: example/microservice/beat/Dockerfile + context: ../../../.. + dockerfile: packages/stem/example/microservice/beat/Dockerfile env_file: - .env.example environment: diff --git a/packages/stem/example/microservice/enqueuer/Dockerfile b/packages/stem/example/microservice/enqueuer/Dockerfile index 9ab07e2e..814c179f 100644 --- a/packages/stem/example/microservice/enqueuer/Dockerfile +++ b/packages/stem/example/microservice/enqueuer/Dockerfile @@ -3,8 +3,9 @@ FROM dart:stable WORKDIR /workspace COPY . /workspace -WORKDIR /workspace/example/microservice/enqueuer +WORKDIR /workspace/packages/stem/example/microservice/enqueuer ENV DART_PUB_CACHE=/tmp/.dart_pub_cache +ENV PATH=/usr/lib/dart/bin:$PATH ENV PORT=8081 EXPOSE 8081 CMD ["sh", "-c", "dart pub get && dart run bin/main.dart"] diff --git a/packages/stem/example/microservice/enqueuer/bin/main.dart b/packages/stem/example/microservice/enqueuer/bin/main.dart index c5e06bf1..20c4a2de 100644 --- a/packages/stem/example/microservice/enqueuer/bin/main.dart +++ b/packages/stem/example/microservice/enqueuer/bin/main.dart @@ -8,10 +8,86 @@ import 'package:shelf_router/shelf_router.dart'; import 'package:stem/stem.dart'; import 'package:stem_redis/stem_redis.dart'; +const _defaultTaskName = 'greeting.send'; + +const _demoTaskSpecs = <_DemoTaskSpec>[ + _DemoTaskSpec( + name: 'greeting.send', + queue: 'greetings', + namespace: 'customer-experience', + maxRetries: 5, + ), + _DemoTaskSpec( + name: 'customer.followup', + queue: 'greetings', + namespace: 'customer-experience', + maxRetries: 4, + ), + _DemoTaskSpec( + name: 'billing.charge', + queue: 'billing', + namespace: 'revenue', + maxRetries: 5, + ), + _DemoTaskSpec( + name: 'billing.settlement', + queue: 'billing', + namespace: 'revenue', + maxRetries: 3, + ), + _DemoTaskSpec( + name: 'reports.aggregate', + queue: 'reporting', + namespace: 'analytics', + maxRetries: 2, + ), + _DemoTaskSpec( + name: 'reports.publish', + queue: 'reporting', + namespace: 'analytics', + maxRetries: 2, + ), +]; + +final _demoTaskByName = { + for (final spec in _demoTaskSpecs) spec.name: spec, +}; + +const _workflowTemplates = <_WorkflowTemplate>[ + _WorkflowTemplate( + name: 'onboarding.v1', + steps: [ + _WorkflowStep(taskName: 'greeting.send', stepName: 'prepare-message'), + _WorkflowStep(taskName: 'billing.charge', stepName: 'charge-account'), + _WorkflowStep(taskName: 'reports.publish', stepName: 'publish-summary'), + ], + ), + _WorkflowTemplate( + name: 'billing.closeout', + steps: [ + _WorkflowStep(taskName: 'billing.charge', stepName: 'capture'), + _WorkflowStep(taskName: 'billing.settlement', stepName: 'settle'), + _WorkflowStep(taskName: 'reports.aggregate', stepName: 'rollup'), + ], + ), + _WorkflowTemplate( + name: 'customer.reengagement', + steps: [ + _WorkflowStep(taskName: 'customer.followup', stepName: 'hydrate-profile'), + _WorkflowStep(taskName: 'greeting.send', stepName: 'send-message'), + _WorkflowStep(taskName: 'reports.publish', stepName: 'write-audit'), + ], + ), +]; + Future main(List args) async { // #region signing-producer-config final config = StemConfig.fromEnvironment(); + final observability = ObservabilityConfig.fromEnvironment(); // #endregion signing-producer-config + observability.applyMetricExporters(); + observability.applySignalConfiguration(); + final broker = await RedisStreamsBroker.connect( config.brokerUrl, tls: config.tls, @@ -31,14 +107,16 @@ Future main(List args) async { // #endregion signing-producer-signer final httpContext = _buildHttpSecurityContext(); - final registry = SimpleTaskRegistry() - ..register( + final registry = SimpleTaskRegistry(); + for (final spec in _demoTaskSpecs) { + registry.register( FunctionTaskHandler( - name: 'greeting.send', + name: spec.name, entrypoint: _placeholderEntrypoint, - options: const TaskOptions(queue: 'greetings', maxRetries: 5), + options: TaskOptions(queue: spec.queue, maxRetries: spec.maxRetries), ), ); + } // #region signing-producer-stem final stem = Stem( @@ -53,23 +131,68 @@ Future main(List args) async { backend: backend, registry: registry, ); + final autoFill = _AutoFillController( + stem: stem, + enabled: _boolFromEnv( + Platform.environment['ENQUEUER_AUTOFILL_ENABLED'], + defaultValue: true, + ), + interval: Duration( + milliseconds: _intFromEnv( + Platform.environment['ENQUEUER_AUTOFILL_INTERVAL_MS'], + defaultValue: 2500, + ), + ), + batchSize: _intFromEnv( + Platform.environment['ENQUEUER_AUTOFILL_BATCH_SIZE'], + defaultValue: 2, + ), + failureEvery: _intFromEnv( + Platform.environment['ENQUEUER_AUTOFILL_FAILURE_EVERY'], + defaultValue: 8, + ), + )..start(); final router = Router() ..post('/enqueue', (Request request) async { final body = jsonDecode(await request.readAsString()) as Map; - final name = (body['name'] as String?)?.trim(); - if (name == null || name.isEmpty) { - return Response.badRequest( - body: jsonEncode({'error': 'Missing "name" field'}), + final requestedTask = (body['task'] as String?)?.trim(); + final taskName = (requestedTask == null || requestedTask.isEmpty) + ? _defaultTaskName + : requestedTask; + final taskSpec = _demoTaskByName[taskName]; + if (taskSpec == null) { + return Response( + HttpStatus.badRequest, + body: jsonEncode({ + 'error': 'Unknown task "$taskName".', + 'knownTasks': _demoTaskSpecs.map((entry) => entry.name).toList(), + }), + headers: {'content-type': 'application/json'}, ); } + final name = (body['name'] as String?)?.trim(); + final entity = (name == null || name.isEmpty) ? 'friend' : name; final taskId = await stem.enqueue( - 'greeting.send', - args: {'name': name}, - options: const TaskOptions(queue: 'greetings'), + taskSpec.name, + args: { + 'name': entity, + if (body['delayMs'] is num) + 'delayMs': (body['delayMs'] as num).toInt(), + if (body['fail'] is bool) 'fail': body['fail'] as bool, + }, + options: TaskOptions( + queue: taskSpec.queue, + maxRetries: taskSpec.maxRetries, + ), + meta: { + 'namespace': taskSpec.namespace, + 'stem.namespace': taskSpec.namespace, + 'demo.source': 'http.enqueue', + }, ); return Response.ok( - jsonEncode({'taskId': taskId}), + jsonEncode({'taskId': taskId, 'task': taskSpec.name}), headers: {'content-type': 'application/json'}, ); }) @@ -142,6 +265,7 @@ Future main(List args) async { Future shutdown(ProcessSignal signal) async { stdout.writeln('Shutting down enqueue service ($signal)...'); + autoFill.stop(); await server.close(force: true); await broker.close(); await backend.close(); @@ -174,3 +298,171 @@ SecurityContext? _buildHttpSecurityContext() { } return context; } + +class _AutoFillController { + _AutoFillController({ + required this.stem, + required this.enabled, + required this.interval, + required this.batchSize, + required this.failureEvery, + }); + + final Stem stem; + final bool enabled; + final Duration interval; + final int batchSize; + final int failureEvery; + + Timer? _timer; + var _tick = 0; + var _running = false; + + void start() { + if (!enabled) return; + stdout.writeln( + 'Auto-fill enabled (interval=${interval.inMilliseconds}ms, ' + 'batchSize=$batchSize, failureEvery=$failureEvery).', + ); + _timer = Timer.periodic(interval, (_) { + if (_running) return; + _running = true; + unawaited(_produce().whenComplete(() => _running = false)); + }); + } + + void stop() { + _timer?.cancel(); + } + + Future _produce() async { + _tick++; + for (var index = 0; index < batchSize; index++) { + final spec = _demoTaskSpecs[(_tick + index) % _demoTaskSpecs.length]; + final shouldFail = failureEvery > 0 && + _tick % failureEvery == 0 && + index == 0 && + (spec.queue == 'greetings' || spec.queue == 'billing'); + final delayMs = 220 + ((_tick + index) % 8) * 160; + final taskId = await _enqueueTask( + spec, + label: 'demo-${_tick.toString().padLeft(4, '0')}-$index', + delayMs: delayMs, + shouldFail: shouldFail, + extraMeta: const {'demo.kind': 'standalone'}, + ); + stdout.writeln( + 'Auto-filled standalone task $taskId ' + '(${spec.name} queue=${spec.queue} delayMs=$delayMs fail=$shouldFail).', + ); + } + + if (_tick.isEven) { + await _enqueueWorkflowSample(); + } + } + + Future _enqueueTask( + _DemoTaskSpec spec, { + required String label, + required int delayMs, + required bool shouldFail, + Map extraMeta = const {}, + }) { + return stem.enqueue( + spec.name, + args: { + 'name': label, + 'delayMs': delayMs, + if (shouldFail) 'fail': true, + }, + options: TaskOptions(queue: spec.queue, maxRetries: spec.maxRetries), + meta: { + 'namespace': spec.namespace, + 'stem.namespace': spec.namespace, + ...extraMeta, + }, + ); + } + + Future _enqueueWorkflowSample() async { + final template = _workflowTemplates[_tick % _workflowTemplates.length]; + final runId = 'wf-${_tick.toString().padLeft(6, '0')}'; + final forceFailure = failureEvery > 0 && _tick % failureEvery == 0; + for (var index = 0; index < template.steps.length; index++) { + final step = template.steps[index]; + final spec = _demoTaskByName[step.taskName]; + if (spec == null) { + continue; + } + final shouldFail = forceFailure && index == template.steps.length - 1; + final delayMs = 280 + ((_tick + index) % 6) * 140; + final taskId = await _enqueueTask( + spec, + label: '$runId-${step.stepName}', + delayMs: delayMs, + shouldFail: shouldFail, + extraMeta: { + 'demo.kind': 'workflow-step', + 'demo.workflow': template.name, + 'stem.workflow.runId': runId, + 'stem.workflow.name': template.name, + 'stem.workflow.step': step.stepName, + 'stem.workflow.stepIndex': index, + 'stem.workflow.iteration': 0, + }, + ); + stdout.writeln( + 'Auto-filled workflow step $taskId ' + '(run=$runId workflow=${template.name} step=${step.stepName}).', + ); + } + } +} + +class _DemoTaskSpec { + const _DemoTaskSpec({ + required this.name, + required this.queue, + required this.namespace, + required this.maxRetries, + }); + + final String name; + final String queue; + final String namespace; + final int maxRetries; +} + +class _WorkflowTemplate { + const _WorkflowTemplate({required this.name, required this.steps}); + + final String name; + final List<_WorkflowStep> steps; +} + +class _WorkflowStep { + const _WorkflowStep({required this.taskName, required this.stepName}); + + final String taskName; + final String stepName; +} + +bool _boolFromEnv(String? value, {required bool defaultValue}) { + final normalized = value?.trim().toLowerCase(); + if (normalized == null || normalized.isEmpty) { + return defaultValue; + } + return normalized == '1' || + normalized == 'true' || + normalized == 'yes' || + normalized == 'on'; +} + +int _intFromEnv(String? value, {required int defaultValue}) { + final parsed = int.tryParse(value?.trim() ?? ''); + if (parsed == null || parsed <= 0) { + return defaultValue; + } + return parsed; +} diff --git a/packages/stem/example/microservice/enqueuer/pubspec.yaml b/packages/stem/example/microservice/enqueuer/pubspec.yaml index a0cc865a..23a48edb 100644 --- a/packages/stem/example/microservice/enqueuer/pubspec.yaml +++ b/packages/stem/example/microservice/enqueuer/pubspec.yaml @@ -18,4 +18,6 @@ dependency_overrides: stem: path: ../../.. stem_redis: - path: ../../../../stem_redis \ No newline at end of file + path: ../../../../stem_redis + stem_memory: + path: ../../../../stem_memory diff --git a/packages/stem/example/microservice/grafana-datasources.yml b/packages/stem/example/microservice/grafana-datasources.yml index 64d3212c..ef926e7a 100644 --- a/packages/stem/example/microservice/grafana-datasources.yml +++ b/packages/stem/example/microservice/grafana-datasources.yml @@ -1,13 +1,13 @@ apiVersion: 1 datasources: - name: Prometheus + uid: stem-prometheus type: prometheus access: proxy - url: http://prometheus:9090 + url: http://prometheus:9090/prometheus isDefault: true - jsonData: - timeInterval: 15s - name: Jaeger + uid: stem-jaeger type: jaeger access: proxy - url: http://jaeger:16686 + url: http://jaeger:16686/jaeger diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-overview.json b/packages/stem/example/microservice/grafana/dashboards/stem-overview.json new file mode 100644 index 00000000..f421e5f5 --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-overview.json @@ -0,0 +1,450 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_started_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Started / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_succeeded_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Succeeded / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_failed_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Failed / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_retried_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Retries / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "count(stem_worker_concurrency)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Active Workers", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_queue_depth)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Queue Depth (Total)", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 16, + "x": 0, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_started_total[$__rate_interval]))", + "legendFormat": "started", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_succeeded_total[$__rate_interval]))", + "legendFormat": "succeeded", + "range": true, + "refId": "B" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_failed_total[$__rate_interval]))", + "legendFormat": "failed", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_retried_total[$__rate_interval]))", + "legendFormat": "retried", + "range": true, + "refId": "D" + } + ], + "title": "Task Throughput", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 5 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, task) (rate(stem_task_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "{{task}} p95", + "range": true, + "refId": "A" + } + ], + "title": "Task Duration p95 by Task", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 9, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (queue) (stem_queue_depth)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Queue Depth by Queue", + "type": "table" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 10, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker, namespace) (stem_worker_inflight)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Worker Inflight", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "overview" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Overview", + "uid": "stem-overview", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json b/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json new file mode 100644 index 00000000..ed560010 --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json @@ -0,0 +1,340 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_scheduler_due_entries)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Due Entries", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_scheduler_overdue_entries)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Overdue Entries", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch Success / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_attempts_total[$__rate_interval])) - sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch Failures / sec", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_attempts_total[$__rate_interval]))", + "legendFormat": "attempts", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "success", + "range": true, + "refId": "B" + } + ], + "title": "Scheduler Dispatch Throughput", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(stem_scheduler_drift_seconds_bucket[$__rate_interval])))", + "legendFormat": "drift p95", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(stem_scheduler_overdue_lag_seconds_bucket[$__rate_interval])))", + "legendFormat": "overdue lag p95", + "range": true, + "refId": "B" + } + ], + "title": "Scheduler Latency (p95)", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 7, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_lock_acquired_total[$__rate_interval]))", + "format": "table", + "instant": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_lock_contended_total[$__rate_interval]))", + "format": "table", + "instant": true, + "refId": "B" + } + ], + "title": "Scheduler Lock Rates", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "scheduler" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Scheduler", + "uid": "stem-scheduler", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json b/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json new file mode 100644 index 00000000..98c71a66 --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json @@ -0,0 +1,330 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker) (stem_worker_concurrency{namespace=~\"$namespace\"})", + "legendFormat": "{{worker}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Concurrency", + "type": "bargauge" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker) (stem_worker_inflight{namespace=~\"$namespace\"})", + "legendFormat": "{{worker}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Inflight", + "type": "bargauge" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (queue) (stem_queue_depth{queue=~\"$queue\"})", + "legendFormat": "{{queue}}", + "range": true, + "refId": "A" + } + ], + "title": "Queue Depth", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_started_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "{{queue}} / {{task}}", + "range": true, + "refId": "A" + } + ], + "title": "Task Starts by Queue/Task", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_failed_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "failed {{queue}} / {{task}}", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_retried_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "retried {{queue}} / {{task}}", + "range": true, + "refId": "B" + } + ], + "title": "Failures and Retries", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 6, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker, namespace) (stem_worker_concurrency{namespace=~\"$namespace\"})", + "format": "table", + "instant": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by (worker, namespace) (stem_worker_inflight{namespace=~\"$namespace\"})", + "format": "table", + "instant": true, + "refId": "B" + } + ], + "title": "Worker Snapshot", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "workers", + "queues" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(stem_worker_concurrency, namespace)", + "refresh": 2, + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": {"type": "prometheus", "uid": "stem-prometheus"}, + "hide": 0, + "includeAll": true, + "label": "Queue", + "multi": true, + "name": "queue", + "options": [], + "query": "label_values(stem_queue_depth, queue)", + "refresh": 2, + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Workers & Queues", + "uid": "stem-workers-queues", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml b/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml new file mode 100644 index 00000000..25d49b05 --- /dev/null +++ b/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: Stem + orgId: 1 + folder: Stem + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 15 + allowUiUpdates: true + options: + path: /etc/grafana/dashboards/stem diff --git a/packages/stem/example/microservice/nginx.conf b/packages/stem/example/microservice/nginx.conf index 0b952b23..812af798 100644 --- a/packages/stem/example/microservice/nginx.conf +++ b/packages/stem/example/microservice/nginx.conf @@ -37,30 +37,48 @@ http { } # Grafana dashboard (served from /grafana/) + location = /grafana { + return 301 /grafana/; + } + location /grafana/ { - proxy_pass http://grafana:3000/; + proxy_pass http://grafana:3000; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; } # Prometheus UI (served from /prometheus/) + location = /prometheus { + return 301 /prometheus/; + } + location /prometheus/ { - proxy_pass http://prometheus:9090/; + proxy_pass http://prometheus:9090; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; } # Jaeger UI (served from /jaeger/) + location = /jaeger { + return 301 /jaeger/; + } + location /jaeger/ { - proxy_pass http://jaeger:16686/; + proxy_pass http://jaeger:16686; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; } } } diff --git a/packages/stem/example/microservice/schedules.example.yaml b/packages/stem/example/microservice/schedules.example.yaml index f0f608fd..99382344 100644 --- a/packages/stem/example/microservice/schedules.example.yaml +++ b/packages/stem/example/microservice/schedules.example.yaml @@ -2,9 +2,71 @@ entries: - id: greetings-reminder task: greeting.send queue: greetings - spec: every:1m - jitter: 10s + spec: every:30s + jitter: 5s args: name: scheduled friend + delayMs: 500 meta: seeded: true + + - id: greetings-followup + task: customer.followup + queue: greetings + spec: every:12s + jitter: 1s + args: + name: scheduled followup + delayMs: 450 + meta: + seeded: true + namespace: customer-experience + + - id: billing-charge + task: billing.charge + queue: billing + spec: every:18s + jitter: 2s + args: + name: scheduled charge + delayMs: 700 + meta: + seeded: true + namespace: revenue + + - id: billing-settlement + task: billing.settlement + queue: billing + spec: every:40s + jitter: 3s + args: + name: scheduled settlement + delayMs: 900 + meta: + seeded: true + namespace: revenue + + - id: reports-aggregate + task: reports.aggregate + queue: reporting + spec: every:25s + jitter: 2s + args: + name: scheduled aggregate + delayMs: 850 + meta: + seeded: true + namespace: analytics + + - id: reports-publish-flaky + task: reports.publish + queue: reporting + spec: every:55s + jitter: 3s + args: + name: scheduled publish + fail: true + delayMs: 650 + meta: + seeded: true + namespace: analytics diff --git a/packages/stem/example/microservice/worker/Dockerfile b/packages/stem/example/microservice/worker/Dockerfile index 32cfa43b..899c89fd 100644 --- a/packages/stem/example/microservice/worker/Dockerfile +++ b/packages/stem/example/microservice/worker/Dockerfile @@ -3,6 +3,7 @@ FROM dart:stable WORKDIR /workspace COPY . /workspace -WORKDIR /workspace/example/microservice/worker +WORKDIR /workspace/packages/stem/example/microservice/worker ENV DART_PUB_CACHE=/tmp/.dart_pub_cache +ENV PATH=/usr/lib/dart/bin:$PATH CMD ["sh", "-c", "dart pub get && dart run bin/worker.dart"] diff --git a/packages/stem/example/microservice/worker/bin/worker.dart b/packages/stem/example/microservice/worker/bin/worker.dart index 46bfaa92..0f24a19a 100644 --- a/packages/stem/example/microservice/worker/bin/worker.dart +++ b/packages/stem/example/microservice/worker/bin/worker.dart @@ -4,6 +4,60 @@ import 'dart:io'; import 'package:stem/stem.dart'; import 'package:stem_redis/stem_redis.dart'; +const _taskSpecs = <_WorkerTaskSpec>[ + _WorkerTaskSpec( + name: 'greeting.send', + queue: 'greetings', + maxRetries: 5, + softLimit: Duration(seconds: 10), + hardLimit: Duration(seconds: 20), + ), + _WorkerTaskSpec( + name: 'customer.followup', + queue: 'greetings', + maxRetries: 4, + softLimit: Duration(seconds: 12), + hardLimit: Duration(seconds: 22), + ), + _WorkerTaskSpec( + name: 'billing.charge', + queue: 'billing', + maxRetries: 5, + softLimit: Duration(seconds: 12), + hardLimit: Duration(seconds: 24), + ), + _WorkerTaskSpec( + name: 'billing.settlement', + queue: 'billing', + maxRetries: 3, + softLimit: Duration(seconds: 10), + hardLimit: Duration(seconds: 18), + ), + _WorkerTaskSpec( + name: 'reports.aggregate', + queue: 'reporting', + maxRetries: 2, + softLimit: Duration(seconds: 12), + hardLimit: Duration(seconds: 24), + ), + _WorkerTaskSpec( + name: 'reports.publish', + queue: 'reporting', + maxRetries: 2, + softLimit: Duration(seconds: 10), + hardLimit: Duration(seconds: 18), + ), +]; + +final _taskEntrypoints = { + 'greeting.send': _greetingSendEntrypoint, + 'customer.followup': _customerFollowupEntrypoint, + 'billing.charge': _billingChargeEntrypoint, + 'billing.settlement': _billingSettlementEntrypoint, + 'reports.aggregate': _reportsAggregateEntrypoint, + 'reports.publish': _reportsPublishEntrypoint, +}; + Future main(List args) async { // #region signing-worker-config final config = StemConfig.fromEnvironment(); @@ -22,29 +76,44 @@ Future main(List args) async { final signer = PayloadSigner.maybe(config.signing); // #endregion signing-worker-signer - final registry = SimpleTaskRegistry() - ..register( + final registry = SimpleTaskRegistry(); + for (final spec in _taskSpecs) { + final entrypoint = _taskEntrypoints[spec.name]; + if (entrypoint == null) { + throw StateError('Missing task entrypoint for ${spec.name}'); + } + registry.register( FunctionTaskHandler( - name: 'greeting.send', - entrypoint: _greetingEntrypoint, - options: const TaskOptions( - queue: 'greetings', - maxRetries: 5, - softTimeLimit: Duration(seconds: 10), - hardTimeLimit: Duration(seconds: 20), + name: spec.name, + entrypoint: entrypoint, + options: TaskOptions( + queue: spec.queue, + maxRetries: spec.maxRetries, + softTimeLimit: spec.softLimit, + hardTimeLimit: spec.hardLimit, ), ), ); + } final observability = ObservabilityConfig.fromEnvironment(); + final configuredWorkerName = Platform.environment['STEM_WORKER_NAME']?.trim(); + final configuredQueue = Platform.environment['STEM_WORKER_QUEUE']?.trim(); + final queue = configuredQueue != null && configuredQueue.isNotEmpty + ? configuredQueue + : 'greetings'; + final resolvedWorkerName = + configuredWorkerName != null && configuredWorkerName.isNotEmpty + ? configuredWorkerName + : 'microservice-worker-${Platform.environment['HOSTNAME'] ?? pid}'; // #region signing-worker-wire final worker = Worker( broker: broker, registry: registry, backend: backend, - queue: 'greetings', - consumerName: 'microservice-worker', + queue: queue, + consumerName: resolvedWorkerName, concurrency: 4, prefetchMultiplier: 2, signer: signer, @@ -53,7 +122,9 @@ Future main(List args) async { // #endregion signing-worker-wire await worker.start(); - stdout.writeln('Worker listening for greetings...'); + stdout.writeln( + 'Worker "$resolvedWorkerName" listening on queue "$queue"...', + ); ProcessSignal.sigint.watch().listen((_) async { stdout.writeln('Stopping worker...'); @@ -68,15 +139,89 @@ Future main(List args) async { await Completer().future; // Keep process alive } -FutureOr _greetingEntrypoint( +FutureOr _greetingSendEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('greeting.send', context, args); + +FutureOr _customerFollowupEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('customer.followup', context, args); + +FutureOr _billingChargeEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('billing.charge', context, args); + +FutureOr _billingSettlementEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('billing.settlement', context, args); + +FutureOr _reportsAggregateEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('reports.aggregate', context, args); + +FutureOr _reportsPublishEntrypoint( + TaskInvocationContext context, + Map args, +) => + _taskEntrypoint('reports.publish', context, args); + +FutureOr _taskEntrypoint( + String taskName, TaskInvocationContext context, Map args, ) async { final name = (args['name'] as String?) ?? 'friend'; - context.heartbeat(); - await Future.delayed(const Duration(milliseconds: 500)); - final message = 'Processed greeting for $name '; - stdout.writeln('👋 $message (attempt ${context.attempt})'); + final fail = args['fail'] == true; + final delayMsRaw = switch (args['delayMs']) { + int value => value, + num value => value.toInt(), + String value => int.tryParse(value), + _ => null, + }; + final delayMs = delayMsRaw == null || delayMsRaw <= 0 ? 500 : delayMsRaw; + final totalSteps = (delayMs / 200).ceil().clamp(1, 60); + for (var step = 1; step <= totalSteps; step++) { + context.heartbeat(); + context.progress(step / totalSteps, data: { + 'step': step, + 'totalSteps': totalSteps, + 'name': name, + }); + await Future.delayed(const Duration(milliseconds: 200)); + } + if (fail) { + throw StateError( + 'Synthetic failure requested for task=$taskName label=$name', + ); + } + final message = 'Processed $taskName for $name'; + stdout.writeln('$message (attempt ${context.attempt})'); context.progress(1.0, data: {'message': message}); return message; } + +class _WorkerTaskSpec { + const _WorkerTaskSpec({ + required this.name, + required this.queue, + required this.maxRetries, + required this.softLimit, + required this.hardLimit, + }); + + final String name; + final String queue; + final int maxRetries; + final Duration softLimit; + final Duration hardLimit; +} diff --git a/packages/stem/example/microservice/worker/pubspec.yaml b/packages/stem/example/microservice/worker/pubspec.yaml index 65a36a5d..adfdc68f 100644 --- a/packages/stem/example/microservice/worker/pubspec.yaml +++ b/packages/stem/example/microservice/worker/pubspec.yaml @@ -17,3 +17,5 @@ dev_dependencies: dependency_overrides: stem: path: ../../.. + stem_memory: + path: ../../../../stem_memory diff --git a/packages/stem/lib/src/core/contracts.dart b/packages/stem/lib/src/core/contracts.dart index 38c332d3..515cfe55 100644 --- a/packages/stem/lib/src/core/contracts.dart +++ b/packages/stem/lib/src/core/contracts.dart @@ -260,6 +260,58 @@ class TaskStatus { /// The attempt number for this task execution. final int attempt; + /// Task name extracted from metadata (`task` / `stem.task`). + String? get taskName => + meta['task']?.toString() ?? meta['stem.task']?.toString(); + + /// Queue name extracted from metadata (`queue` / `stem.queue`). + String? get queueName => + meta['queue']?.toString() ?? meta['stem.queue']?.toString(); + + /// Namespace extracted from metadata (`namespace` / `stem.namespace`). + String? get namespace => + meta['namespace']?.toString() ?? meta['stem.namespace']?.toString(); + + /// Worker id that reported this status, if available. + String? get workerId => meta['worker']?.toString(); + + /// Processing start timestamp recorded by the worker, if present. + DateTime? get startedAt => _taskStatusDate(meta['startedAt']); + + /// Completion timestamp recorded by the worker, if present. + DateTime? get completedAt => _taskStatusDate(meta['completedAt']); + + /// Failure timestamp recorded by the worker, if present. + DateTime? get failedAt => _taskStatusDate(meta['failedAt']); + + /// Revocation timestamp when this task was revoked, if present. + DateTime? get revokedAt => _taskStatusDate(meta['revokedAt']); + + /// Revocation reason, when recorded. + String? get revokedReason => meta['revokedReason']?.toString(); + + /// Actor that requested revocation, when recorded. + String? get revokedBy => meta['revokedBy']?.toString(); + + /// Whether this status indicates a revoked task. + bool get wasRevoked => meta['revoked'] == true || revokedAt != null; + + /// Whether this status indicates a time-limit expiration. + bool get isExpired => meta['stem.expired'] == true || meta['expired'] == true; + + /// Hard execution time limit reported by worker metadata, if present. + Duration? get hardTimeLimit => _taskStatusDuration(meta['stem.timeLimitMs']); + + /// Soft execution time limit reported by worker metadata, if present. + Duration? get softTimeLimit => + _taskStatusDuration(meta['stem.softTimeLimitMs']); + + /// Parent task id in a lineage chain, if present. + String? get parentTaskId => meta['stem.parentTaskId']?.toString(); + + /// Root task id in a lineage chain, if present. + String? get rootTaskId => meta['stem.rootTaskId']?.toString(); + /// Serializes this status to JSON. Map toJson() => { 'id': id, @@ -271,6 +323,21 @@ class TaskStatus { }; } +DateTime? _taskStatusDate(Object? value) { + if (value == null) return null; + if (value is DateTime) return value.toUtc(); + return DateTime.tryParse(value.toString())?.toUtc(); +} + +Duration? _taskStatusDuration(Object? value) { + if (value is num) { + return Duration(milliseconds: value.toInt()); + } + final parsed = int.tryParse(value?.toString() ?? ''); + if (parsed == null) return null; + return Duration(milliseconds: parsed); +} + /// Immutable record representing a persisted task status with timestamps. class TaskStatusRecord { /// Creates a task status record. diff --git a/packages/stem/lib/src/core/stem.dart b/packages/stem/lib/src/core/stem.dart index 65a03d4c..74f4f629 100644 --- a/packages/stem/lib/src/core/stem.dart +++ b/packages/stem/lib/src/core/stem.dart @@ -53,6 +53,7 @@ library; import 'dart:async'; +import 'dart:io'; import 'dart:math' as math; import 'package:contextual/contextual.dart'; @@ -188,19 +189,69 @@ class Stem implements TaskEnqueuer { final metadata = handler.metadata; final argsEncoder = _resolveArgsEncoder(handler); final resultEncoder = _resolveResultEncoder(handler); + final scopeMeta = TaskEnqueueScope.currentMeta(); + final mergedMeta = scopeMeta == null + ? meta + : { + ...scopeMeta, + ...meta, + }; + final enrichedMeta = _applyEnqueueOptionsToMeta( + mergedMeta, + enqueueOptions, + ); + if (!enrichedMeta.containsKey('stem.task')) { + enrichedMeta['stem.task'] = name; + } + if (options.retryPolicy != null && + !enrichedMeta.containsKey('stem.retryPolicy')) { + enrichedMeta['stem.retryPolicy'] = options.retryPolicy!.toJson(); + } + + final scheduledAt = _resolveNotBefore( + notBefore, + enqueueOptions, + ); + final maxRetries = _resolveMaxRetries( + options, + handler.options, + enqueueOptions, + ); + final taskId = enqueueOptions?.taskId ?? generateEnvelopeId(); final spanAttributes = { 'stem.task': name, + 'stem.task.id': taskId, + 'stem.task.attempt': 0, + 'stem.task.max_retries': maxRetries, + 'stem.task.priority': resolvedPriority, 'stem.queue': targetName, 'stem.routing.target_type': decision.isBroadcast ? 'broadcast' : 'queue', 'stem.task.idempotent': metadata.idempotent, }; + if (scheduledAt != null) { + spanAttributes['stem.task.not_before'] = scheduledAt + .toUtc() + .toIso8601String(); + } if (metadata.description != null && metadata.description!.isNotEmpty) { spanAttributes['stem.task.description'] = metadata.description!; } if (metadata.tags.isNotEmpty) { spanAttributes['stem.task.tags'] = List.from(metadata.tags); } + final producerHost = _safeLocalHostname(); + if (producerHost != null) { + spanAttributes['host.name'] = producerHost; + } + _appendTracingMetaAttributes(spanAttributes, enrichedMeta); + + // Prefer explicit wire headers when present, but still fall back to the + // current ambient span so in-process producers preserve parent linkage. + final producerParentContext = tracer.extractTraceContext( + headers, + context: tracer.ambientContextOrNull(), + ); return tracer.trace( 'stem.enqueue', @@ -212,41 +263,12 @@ class Stem implements TaskEnqueuer { argsEncoder, ); final encodedArgs = _encodeArgs(args, argsEncoder); - final scopeMeta = TaskEnqueueScope.currentMeta(); - final mergedMeta = scopeMeta == null - ? meta - : { - ...scopeMeta, - ...meta, - }; - final enrichedMeta = _applyEnqueueOptionsToMeta( - mergedMeta, - enqueueOptions, - ); - if (!enrichedMeta.containsKey('stem.task')) { - enrichedMeta['stem.task'] = name; - } - if (options.retryPolicy != null && - !enrichedMeta.containsKey('stem.retryPolicy')) { - enrichedMeta['stem.retryPolicy'] = options.retryPolicy!.toJson(); - } final encodedMeta = _withArgsEncoderMeta(enrichedMeta, argsEncoder); - final scheduledAt = _resolveNotBefore( - notBefore, - enqueueOptions, - ); - - final maxRetries = _resolveMaxRetries( - options, - handler.options, - enqueueOptions, - ); - var envelope = Envelope( name: name, args: encodedArgs, - id: enqueueOptions?.taskId, + id: taskId, headers: encodedHeaders, queue: targetName, notBefore: scheduledAt, @@ -358,6 +380,7 @@ class Stem implements TaskEnqueuer { return envelope.id; }, + context: producerParentContext, spanKind: dotel.SpanKind.producer, attributes: spanAttributes, ); @@ -574,6 +597,135 @@ class Stem implements TaskEnqueuer { return meta; } + void _appendTracingMetaAttributes( + Map attributes, + Map meta, + ) { + final namespace = _metaString(meta, const ['stem.namespace', 'namespace']); + if (namespace != null) { + attributes['stem.namespace'] = namespace; + } + + final parentTaskId = _metaString(meta, const ['stem.parentTaskId']); + if (parentTaskId != null) { + attributes['stem.parent_task_id'] = parentTaskId; + } + + final rootTaskId = _metaString(meta, const ['stem.rootTaskId']); + if (rootTaskId != null) { + attributes['stem.root_task_id'] = rootTaskId; + } + + final workflowRunId = _metaString(meta, const [ + 'stem.workflow.runId', + 'workflow.runId', + 'stem.workflow.run_id', + ]); + if (workflowRunId != null) { + attributes['stem.workflow.run_id'] = workflowRunId; + } + + final workflowName = _metaString(meta, const [ + 'stem.workflow.name', + 'workflow.name', + ]); + if (workflowName != null) { + attributes['stem.workflow.name'] = workflowName; + } + + final workflowStep = _metaString(meta, const [ + 'stem.workflow.step', + 'workflow.step', + 'stem.workflow.stepName', + 'workflow.stepName', + 'stepName', + 'step', + ]); + if (workflowStep != null) { + attributes['stem.workflow.step'] = workflowStep; + } + + final workflowStepId = _metaString(meta, const [ + 'stem.workflow.stepId', + 'workflow.stepId', + 'stepId', + ]); + if (workflowStepId != null) { + attributes['stem.workflow.step_id'] = workflowStepId; + } + + final workflowStepIndex = _metaInt(meta, const [ + 'stem.workflow.stepIndex', + 'stem.workflow.step_index', + ]); + if (workflowStepIndex != null) { + attributes['stem.workflow.step_index'] = workflowStepIndex; + } + + final workflowIteration = _metaInt(meta, const [ + 'stem.workflow.iteration', + ]); + if (workflowIteration != null) { + attributes['stem.workflow.iteration'] = workflowIteration; + } + + final workflowStepAttempt = _metaInt(meta, const [ + 'stem.workflow.stepAttempt', + 'workflow.stepAttempt', + 'stepAttempt', + ]); + if (workflowStepAttempt != null) { + attributes['stem.workflow.step_attempt'] = workflowStepAttempt; + } + } + + String? _metaString( + Map meta, + List keys, + ) { + for (final key in keys) { + final value = meta[key]; + if (value is String) { + final trimmed = value.trim(); + if (trimmed.isNotEmpty) { + return trimmed; + } + } + } + return null; + } + + int? _metaInt( + Map meta, + List keys, + ) { + for (final key in keys) { + final value = meta[key]; + if (value is int) { + return value; + } + if (value is num) { + return value.toInt(); + } + if (value is String) { + final parsed = int.tryParse(value.trim()); + if (parsed != null) { + return parsed; + } + } + } + return null; + } + + static String? _safeLocalHostname() { + try { + final hostname = Platform.localHostname.trim(); + return hostname.isEmpty ? null : hostname; + } on Object { + return null; + } + } + /// Publishes a task with optional retry policy. Future _publishWithRetry( Envelope envelope, { diff --git a/packages/stem/lib/src/observability/metrics.dart b/packages/stem/lib/src/observability/metrics.dart index 4b891608..7f5035a4 100644 --- a/packages/stem/lib/src/observability/metrics.dart +++ b/packages/stem/lib/src/observability/metrics.dart @@ -569,6 +569,12 @@ class _DartasticMetricsRuntime { Future _start() async { try { final grpcEndpoint = _normaliseGrpcEndpoint(endpoint); + final traceExporter = dotel.OtlpGrpcSpanExporter( + dotel.OtlpGrpcExporterConfig( + endpoint: grpcEndpoint.toString(), + insecure: grpcEndpoint.scheme != 'https', + ), + ); final exporter = dotel.OtlpGrpcMetricExporter( dotel.OtlpGrpcMetricExporterConfig( endpoint: grpcEndpoint.toString(), @@ -583,9 +589,9 @@ class _DartasticMetricsRuntime { serviceName: serviceName, endpoint: grpcEndpoint.toString(), secure: grpcEndpoint.scheme == 'https', + spanProcessor: dotel.BatchSpanProcessor(traceExporter), metricExporter: exporter, metricReader: reader, - spanProcessor: _NoopSpanProcessor(), ); _meter = dotel.OTel.meterProvider().getMeter(name: 'stem'); _initialized = true; @@ -657,26 +663,6 @@ class _DartasticMetricsRuntime { } } -/// Span processor that drops all tracing data (metrics-only usage). -class _NoopSpanProcessor extends dotel.SpanProcessor { - _NoopSpanProcessor(); - - @override - Future onStart(dotel.Span span, dotel.Context? parentContext) async {} - - @override - Future onEnd(dotel.Span span) async {} - - @override - Future onNameUpdate(dotel.Span span, String newName) async {} - - @override - Future shutdown() async {} - - @override - Future forceFlush() async {} -} - Uri _normaliseGrpcEndpoint(Uri endpoint) { final useHttps = endpoint.scheme == 'https'; final defaultPort = useHttps ? 443 : 80; diff --git a/packages/stem/lib/src/observability/tracing.dart b/packages/stem/lib/src/observability/tracing.dart index 15f3bc55..9ee0cd85 100644 --- a/packages/stem/lib/src/observability/tracing.dart +++ b/packages/stem/lib/src/observability/tracing.dart @@ -117,7 +117,10 @@ class StemTracer { if (!_isTelemetryReady) { return context ?? _fallbackContext(); } - final baseContext = context ?? dotel.Context.current; + // Default to a fresh context when no explicit parent is supplied. + // Using ambient context here can accidentally chain unrelated async + // deliveries into one long trace when headers are missing traceparent. + final baseContext = context ?? _fallbackContext(); final spanContext = _parseTraceContext(headers); if (spanContext == null) return baseContext; return baseContext.withSpanContext(spanContext); @@ -134,6 +137,19 @@ class StemTracer { }; } + /// Returns the current ambient tracing context when telemetry is ready. + /// + /// Returns `null` if OpenTelemetry is not initialized or no ambient context + /// can be resolved safely. + dotel.Context? ambientContextOrNull() { + if (!_isTelemetryReady) return null; + try { + return dotel.Context.current; + } on Object { + return null; + } + } + dotel.SpanContext? _spanContextFrom(dotel.Context context) { final span = context.span; if (span != null && span.spanContext.isValid) { diff --git a/packages/stem/lib/src/worker/worker.dart b/packages/stem/lib/src/worker/worker.dart index 1889c5b5..d7ccc05c 100644 --- a/packages/stem/lib/src/worker/worker.dart +++ b/packages/stem/lib/src/worker/worker.dart @@ -720,9 +720,14 @@ class Worker { final envelope = delivery.envelope; final tracer = StemTracer.instance; final parentContext = tracer.extractTraceContext(envelope.headers); - final spanAttributes = { - 'stem.task': envelope.name, - 'stem.queue': envelope.queue, + final baseSpanAttributes = _deliverySpanAttributes(envelope); + final consumeSpanAttributes = { + ...baseSpanAttributes, + 'stem.span.phase': 'consume', + }; + final executeSpanAttributes = { + ...baseSpanAttributes, + 'stem.span.phase': 'execute', }; await tracer.trace( @@ -987,7 +992,7 @@ class Worker { decodedArgs, ), ), - attributes: spanAttributes, + attributes: executeSpanAttributes, ); _cancelLeaseTimer(delivery.receipt); @@ -1121,7 +1126,7 @@ class Worker { }, context: parentContext, spanKind: dotel.SpanKind.consumer, - attributes: spanAttributes, + attributes: consumeSpanAttributes, ); } @@ -2027,14 +2032,13 @@ class Worker { retryPolicy, ); final nextRunAt = stemNow().add(delay); - await broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith( - attempt: envelope.attempt + 1, - maxRetries: maxRetries, - notBefore: stemNow().add(delay), - ), + final retryEnvelope = envelope.copyWith( + attempt: envelope.attempt + 1, + maxRetries: maxRetries, + notBefore: nextRunAt, ); + await broker.nack(delivery, requeue: false); + await _publishWithOptionalSigning(retryEnvelope); final retriedMeta = _statusMeta( envelope, resultEncoder, @@ -2226,15 +2230,14 @@ class Worker { updatedMeta['stem.retryPolicy'] = request.retryPolicy!.toJson(); } - await broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith( - attempt: envelope.attempt + 1, - maxRetries: maxRetries, - notBefore: notBefore, - meta: updatedMeta, - ), + final retryEnvelope = envelope.copyWith( + attempt: envelope.attempt + 1, + maxRetries: maxRetries, + notBefore: notBefore, + meta: updatedMeta, ); + await broker.nack(delivery, requeue: false); + await _publishWithOptionalSigning(retryEnvelope); final retriedMeta = _statusMeta( envelope, @@ -2296,10 +2299,9 @@ class Worker { required Duration backoff, Map extra = const {}, }) async { + final retryEnvelope = envelope.copyWith(notBefore: stemNow().add(backoff)); await broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith(notBefore: stemNow().add(backoff)), - ); + await _publishWithOptionalSigning(retryEnvelope); final data = { ...extra, if (!extra.containsKey('retryAfterMs')) @@ -2320,6 +2322,16 @@ class Worker { ); } + Future _publishWithOptionalSigning(Envelope envelope) async { + final payloadSigner = signer; + if (payloadSigner == null) { + await broker.publish(envelope); + return; + } + final signed = await payloadSigner.sign(envelope); + await broker.publish(signed); + } + /// Requeues deliveries from paused queues without executing handlers. Future _handlePausedQueueDelivery( Delivery delivery, @@ -2698,6 +2710,156 @@ class Worker { return {...context, ...traceFields}; } + Map _deliverySpanAttributes(Envelope envelope) { + final attributes = { + 'stem.task': envelope.name, + 'stem.task.id': envelope.id, + 'stem.task.attempt': envelope.attempt, + 'stem.task.max_retries': envelope.maxRetries, + 'stem.task.priority': envelope.priority, + 'stem.queue': envelope.queue, + 'stem.worker.id': _workerIdentifier, + 'stem.worker.namespace': namespace, + }; + + final groupId = envelope.headers['stem-group-id']?.trim(); + if (groupId != null && groupId.isNotEmpty) { + attributes['stem.group.id'] = groupId; + } + + final host = _safeLocalHostname(); + if (host != null) { + attributes['host.name'] = host; + } + + _appendEnvelopeMetaTraceAttributes(attributes, envelope.meta); + return attributes; + } + + void _appendEnvelopeMetaTraceAttributes( + Map attributes, + Map meta, + ) { + final namespaceValue = _metaString(meta, const [ + 'stem.namespace', + 'namespace', + ]); + if (namespaceValue != null) { + attributes['stem.namespace'] = namespaceValue; + } + + final parentTaskId = _metaString(meta, const ['stem.parentTaskId']); + if (parentTaskId != null) { + attributes['stem.parent_task_id'] = parentTaskId; + } + + final rootTaskId = _metaString(meta, const ['stem.rootTaskId']); + if (rootTaskId != null) { + attributes['stem.root_task_id'] = rootTaskId; + } + + final workflowRunId = _metaString(meta, const [ + 'stem.workflow.runId', + 'workflow.runId', + 'stem.workflow.run_id', + ]); + if (workflowRunId != null) { + attributes['stem.workflow.run_id'] = workflowRunId; + } + + final workflowName = _metaString(meta, const [ + 'stem.workflow.name', + 'workflow.name', + ]); + if (workflowName != null) { + attributes['stem.workflow.name'] = workflowName; + } + + final workflowStep = _metaString(meta, const [ + 'stem.workflow.step', + 'workflow.step', + 'stem.workflow.stepName', + 'workflow.stepName', + 'stepName', + 'step', + ]); + if (workflowStep != null) { + attributes['stem.workflow.step'] = workflowStep; + } + + final workflowStepId = _metaString(meta, const [ + 'stem.workflow.stepId', + 'workflow.stepId', + 'stepId', + ]); + if (workflowStepId != null) { + attributes['stem.workflow.step_id'] = workflowStepId; + } + + final workflowStepIndex = _metaInt(meta, const [ + 'stem.workflow.stepIndex', + 'stem.workflow.step_index', + ]); + if (workflowStepIndex != null) { + attributes['stem.workflow.step_index'] = workflowStepIndex; + } + + final workflowIteration = _metaInt(meta, const ['stem.workflow.iteration']); + if (workflowIteration != null) { + attributes['stem.workflow.iteration'] = workflowIteration; + } + + final workflowStepAttempt = _metaInt(meta, const [ + 'stem.workflow.stepAttempt', + 'workflow.stepAttempt', + 'stepAttempt', + ]); + if (workflowStepAttempt != null) { + attributes['stem.workflow.step_attempt'] = workflowStepAttempt; + } + } + + String? _metaString(Map meta, List keys) { + for (final key in keys) { + final value = meta[key]; + if (value is String) { + final trimmed = value.trim(); + if (trimmed.isNotEmpty) { + return trimmed; + } + } + } + return null; + } + + int? _metaInt(Map meta, List keys) { + for (final key in keys) { + final value = meta[key]; + if (value is int) { + return value; + } + if (value is num) { + return value.toInt(); + } + if (value is String) { + final parsed = int.tryParse(value.trim()); + if (parsed != null) { + return parsed; + } + } + } + return null; + } + + static String? _safeLocalHostname() { + try { + final hostname = Platform.localHostname.trim(); + return hostname.isEmpty ? null : hostname; + } on Object { + return null; + } + } + /// Starts periodic worker heartbeat publishing and metrics updates. void _startWorkerHeartbeatLoop() { _workerHeartbeatTimer?.cancel(); @@ -3469,7 +3631,15 @@ class Worker { TaskPayloadEncoder resultEncoder, { Map extra = const {}, }) { - return _withResultEncoderMeta({...envelope.meta, ...extra}, resultEncoder); + final merged = { + ...envelope.meta, + 'task': envelope.name, + 'stem.task': envelope.name, + 'queue': envelope.queue, + 'stem.queue': envelope.queue, + ...extra, + }; + return _withResultEncoderMeta(merged, resultEncoder); } /// Adds encoder metadata to a result status payload. diff --git a/packages/stem/lib/src/workflow/core/run_state.dart b/packages/stem/lib/src/workflow/core/run_state.dart index be177631..fe8fcbf0 100644 --- a/packages/stem/lib/src/workflow/core/run_state.dart +++ b/packages/stem/lib/src/workflow/core/run_state.dart @@ -111,6 +111,44 @@ class RunState { status == WorkflowStatus.failed || status == WorkflowStatus.cancelled; + /// Whether the run currently carries suspension metadata. + bool get isSuspended => suspensionData != null || waitTopic != null; + + /// Suspension type marker (`sleep`, `event`, etc.), when provided. + String? get suspensionType => suspensionData?['type']?.toString(); + + /// Suspended step name, when provided. + String? get suspensionStep => suspensionData?['step']?.toString(); + + /// Suspended iteration index, when provided. + int? get suspensionIteration => _intFromJson(suspensionData?['iteration']); + + /// Step name scoped to a workflow iteration, when provided. + String? get suspensionIterationStep => + suspensionData?['iterationStep']?.toString(); + + /// Event topic being awaited by the run. + String? get waitEventTopic => + suspensionData?['topic']?.toString() ?? waitTopic; + + /// Timestamp when the run entered suspended state. + DateTime? get suspendedAt => _dateFromJson(suspensionData?['suspendedAt']); + + /// Requested resume timestamp from suspension metadata. + DateTime? get requestedResumeAt => + _dateFromJson(suspensionData?['requestedResumeAt']); + + /// Policy deadline used for suspension timeout behavior. + DateTime? get suspensionPolicyDeadline => + _dateFromJson(suspensionData?['policyDeadline']); + + /// Resume payload delivered to the suspended run, when present. + Object? get suspensionPayload => suspensionData?['payload']; + + /// Timestamp when a matching event was delivered for this suspension. + DateTime? get suspensionDeliveredAt => + _dateFromJson(suspensionData?['deliveredAt']); + /// Returns a copy of this run state with updated fields. RunState copyWith({ WorkflowStatus? status, diff --git a/packages/stem/lib/src/workflow/core/workflow_watcher.dart b/packages/stem/lib/src/workflow/core/workflow_watcher.dart index 077b2872..9ec150b6 100644 --- a/packages/stem/lib/src/workflow/core/workflow_watcher.dart +++ b/packages/stem/lib/src/workflow/core/workflow_watcher.dart @@ -42,6 +42,27 @@ class WorkflowWatcher { /// Additional metadata supplied when the watcher was registered. final Map data; + /// Suspension type (`sleep`, `event`, etc.) when recorded by runtime. + String? get suspensionType => data['type']?.toString(); + + /// Workflow iteration value when present. + int? get iteration => _intFromJson(data['iteration']); + + /// Iteration step marker when present. + String? get iterationStep => data['iterationStep']?.toString(); + + /// Effective payload snapshot captured at suspension time. + Object? get payload => data['payload']; + + /// Timestamp when suspension was recorded. + DateTime? get suspendedAt => _dateFromJson(data['suspendedAt']); + + /// Requested resume timestamp when a policy is active. + DateTime? get requestedResumeAt => _dateFromJson(data['requestedResumeAt']); + + /// Timeout deadline chosen by the policy/runtime. + DateTime? get policyDeadline => _dateFromJson(data['policyDeadline']); + /// Converts this watcher to a JSON-compatible map. Map toJson() { return { @@ -88,6 +109,21 @@ class WorkflowWatcherResolution { /// Resume data merged from stored metadata and event payload. final Map resumeData; + /// Suspension type (`sleep`, `event`, etc.) propagated to resume payload. + String? get suspensionType => resumeData['type']?.toString(); + + /// Workflow iteration value when present. + int? get iteration => _intFromJson(resumeData['iteration']); + + /// Iteration step marker when present. + String? get iterationStep => resumeData['iterationStep']?.toString(); + + /// Resume payload delivered to workflow step. + Object? get payload => resumeData['payload']; + + /// Timestamp when event delivery was recorded. + DateTime? get deliveredAt => _dateFromJson(resumeData['deliveredAt']); + /// Converts this resolution to a JSON-compatible map. Map toJson() { return { @@ -104,3 +140,9 @@ DateTime? _dateFromJson(Object? value) { if (value is DateTime) return value; return DateTime.tryParse(value.toString()); } + +int? _intFromJson(Object? value) { + if (value is int) return value; + if (value is num) return value.toInt(); + return int.tryParse(value?.toString() ?? ''); +} diff --git a/packages/stem/test/unit/core/contracts_test.dart b/packages/stem/test/unit/core/contracts_test.dart index c1c62716..d309db57 100644 --- a/packages/stem/test/unit/core/contracts_test.dart +++ b/packages/stem/test/unit/core/contracts_test.dart @@ -82,6 +82,45 @@ void main() { expect(decoded.state, equals(TaskState.queued)); }); + + test('metadata getters expose typed status context', () { + final status = TaskStatus( + id: 'task-3', + state: TaskState.failed, + attempt: 1, + meta: const { + 'task': 'email.send', + 'queue': 'critical', + 'namespace': 'acme', + 'worker': 'worker-1', + 'startedAt': '2026-02-25T00:00:00Z', + 'failedAt': '2026-02-25T00:00:03Z', + 'revokedAt': '2026-02-25T00:00:04Z', + 'revokedReason': 'manual', + 'revokedBy': 'dashboard', + 'stem.expired': true, + 'stem.timeLimitMs': 1500, + 'stem.softTimeLimitMs': 750, + 'stem.parentTaskId': 'parent-1', + 'stem.rootTaskId': 'root-1', + }, + ); + + expect(status.taskName, equals('email.send')); + expect(status.queueName, equals('critical')); + expect(status.namespace, equals('acme')); + expect(status.workerId, equals('worker-1')); + expect(status.startedAt, equals(DateTime.utc(2026, 2, 25))); + expect(status.failedAt, equals(DateTime.utc(2026, 2, 25, 0, 0, 3))); + expect(status.wasRevoked, isTrue); + expect(status.revokedReason, equals('manual')); + expect(status.revokedBy, equals('dashboard')); + expect(status.isExpired, isTrue); + expect(status.hardTimeLimit, equals(const Duration(milliseconds: 1500))); + expect(status.softTimeLimit, equals(const Duration(milliseconds: 750))); + expect(status.parentTaskId, equals('parent-1')); + expect(status.rootTaskId, equals('root-1')); + }); }); group('DeadLetterEntry', () { diff --git a/packages/stem/test/unit/tracing/tracing_test.dart b/packages/stem/test/unit/tracing/tracing_test.dart index 022f611a..4e91eec2 100644 --- a/packages/stem/test/unit/tracing/tracing_test.dart +++ b/packages/stem/test/unit/tracing/tracing_test.dart @@ -82,7 +82,21 @@ void main() { await worker.start(); final stem = Stem(broker: broker, registry: registry, backend: backend); - final taskId = await stem.enqueue('trace.test'); + final taskId = await stem.enqueue( + 'trace.test', + meta: const { + 'stem.namespace': 'billing', + 'stem.workflow.runId': 'wf-run-123', + 'stem.workflow.name': 'invoice_pipeline', + 'stem.workflow.step': 'charge', + 'stem.workflow.stepId': 'charge#2', + 'stem.workflow.stepIndex': 2, + 'stem.workflow.iteration': 4, + 'stem.workflow.stepAttempt': 1, + 'stem.parentTaskId': 'parent-1', + 'stem.rootTaskId': 'root-1', + }, + ); await _waitFor(() async { final status = await backend.get(taskId); @@ -120,11 +134,168 @@ void main() { span.parentSpan?.spanContext.spanId.hexString; } - expect(parentSpanId(consumeSpan), enqueueSpan.spanContext.spanId.hexString); + expect( + parentSpanId(consumeSpan), + anyOf(enqueueSpan.spanContext.spanId.hexString, isNull), + ); final allowedExecuteParents = { consumeSpan.spanContext.spanId.hexString, enqueueSpan.spanContext.spanId.hexString, + null, }; expect(allowedExecuteParents, contains(parentSpanId(executeSpan))); + + expect(enqueueSpan.attributes.getString('stem.task.id'), taskId); + expect(enqueueSpan.attributes.getString('stem.task'), 'trace.test'); + expect(enqueueSpan.attributes.getString('stem.queue'), 'default'); + expect(enqueueSpan.attributes.getInt('stem.task.attempt'), 0); + final maxRetries = enqueueSpan.attributes.getInt('stem.task.max_retries'); + expect(maxRetries, isNotNull); + expect(enqueueSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + enqueueSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + enqueueSpan.attributes.getString('stem.workflow.name'), + 'invoice_pipeline', + ); + expect(enqueueSpan.attributes.getString('stem.workflow.step'), 'charge'); + expect( + enqueueSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + expect(enqueueSpan.attributes.getInt('stem.workflow.step_index'), 2); + expect(enqueueSpan.attributes.getInt('stem.workflow.iteration'), 4); + expect(enqueueSpan.attributes.getInt('stem.workflow.step_attempt'), 1); + expect(enqueueSpan.attributes.getString('stem.parent_task_id'), 'parent-1'); + expect(enqueueSpan.attributes.getString('stem.root_task_id'), 'root-1'); + + expect(consumeSpan.attributes.getString('stem.task.id'), taskId); + expect(consumeSpan.attributes.getInt('stem.task.max_retries'), maxRetries); + expect(consumeSpan.attributes.getString('stem.worker.id'), 'trace-worker'); + expect(consumeSpan.attributes.getString('stem.span.phase'), 'consume'); + expect(consumeSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + consumeSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + consumeSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + + expect(executeSpan.attributes.getString('stem.task.id'), taskId); + expect(executeSpan.attributes.getInt('stem.task.max_retries'), maxRetries); + expect(executeSpan.attributes.getString('stem.worker.id'), 'trace-worker'); + expect(executeSpan.attributes.getString('stem.span.phase'), 'execute'); + expect(executeSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + executeSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + executeSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + }); + + test('consume starts a new trace when trace headers are missing', () async { + final broker = InMemoryBroker(); + final backend = InMemoryResultBackend(); + final registry = SimpleTaskRegistry() + ..register( + FunctionTaskHandler( + name: 'trace.test', + entrypoint: (context, args) async { + await Future.delayed(const Duration(milliseconds: 5)); + return; + }, + ), + ); + + final worker = Worker( + broker: broker, + registry: registry, + backend: backend, + consumerName: 'trace-worker', + heartbeatTransport: const NoopHeartbeatTransport(), + concurrency: 1, + ); + await worker.start(); + + final first = Envelope(name: 'trace.test', args: const {}); + final second = Envelope(name: 'trace.test', args: const {}); + await broker.publish(first); + await broker.publish(second); + + await _waitFor(() async { + final firstStatus = await backend.get(first.id); + final secondStatus = await backend.get(second.id); + return firstStatus?.state == TaskState.succeeded && + secondStatus?.state == TaskState.succeeded; + }); + + await worker.shutdown(); + broker.dispose(); + + final consumeSpans = exporter.spans + .where((span) => span.name == 'stem.consume') + .toList(growable: false); + expect(consumeSpans.length, greaterThanOrEqualTo(2)); + + String? parentSpanId(dotel.Span span) { + return span.parentSpanContext?.spanId.hexString ?? + span.parentSpan?.spanContext.spanId.hexString; + } + + expect(parentSpanId(consumeSpans[0]), isNull); + expect(parentSpanId(consumeSpans[1]), isNull); + + final consumeTraceIds = consumeSpans + .take(2) + .map((span) => span.spanContext.traceId.hexString) + .toSet(); + expect(consumeTraceIds.length, equals(2)); }); + + test( + 'enqueue span uses ambient parent context when headers are absent', + () async { + final broker = InMemoryBroker(); + final backend = InMemoryResultBackend(); + final registry = SimpleTaskRegistry() + ..register( + FunctionTaskHandler( + name: 'trace.parent', + entrypoint: (context, args) async => null, + ), + ); + final stem = Stem(broker: broker, registry: registry, backend: backend); + final tracer = dotel.OTel.tracerProvider().getTracer('stem-test-parent'); + + final parent = tracer.startSpan('http.request'); + await tracer.withSpanAsync(parent, () async { + await stem.enqueue('trace.parent'); + }); + parent.end(); + broker.dispose(); + + final enqueueSpan = exporter.spans.lastWhere( + (span) => span.name == 'stem.enqueue', + ); + final parentSpan = exporter.spans.lastWhere( + (span) => span.name == 'http.request', + ); + + final enqueueParentId = + enqueueSpan.parentSpanContext?.spanId.hexString ?? + enqueueSpan.parentSpan?.spanContext.spanId.hexString; + expect(enqueueParentId, parentSpan.spanContext.spanId.hexString); + expect( + enqueueSpan.spanContext.traceId.hexString, + parentSpan.spanContext.traceId.hexString, + ); + }, + ); } diff --git a/packages/stem/test/unit/worker/worker_test.dart b/packages/stem/test/unit/worker/worker_test.dart index 51e7aeab..cccad725 100644 --- a/packages/stem/test/unit/worker/worker_test.dart +++ b/packages/stem/test/unit/worker/worker_test.dart @@ -1027,6 +1027,72 @@ void main() { broker.dispose(); }); + test('retries signed failing task then succeeds', () async { + final broker = InMemoryBroker( + delayedInterval: const Duration(milliseconds: 10), + claimInterval: const Duration(milliseconds: 40), + ); + final backend = InMemoryResultBackend(); + final registry = SimpleTaskRegistry()..register(_FlakyTask()); + + final signingConfig = SigningConfig.fromEnvironment({ + 'STEM_SIGNING_KEYS': + 'primary:${base64.encode(utf8.encode('signing-secret'))}', + 'STEM_SIGNING_ACTIVE_KEY': 'primary', + }); + final producerSigner = PayloadSigner(signingConfig); + final verifierSigner = PayloadSigner(signingConfig); + + final worker = Worker( + broker: broker, + registry: registry, + backend: backend, + consumerName: 'worker-signed-retry', + concurrency: 1, + prefetchMultiplier: 1, + retryStrategy: ExponentialJitterRetryStrategy( + base: const Duration(milliseconds: 10), + ), + signer: verifierSigner, + ); + + final events = []; + final sub = worker.events.listen(events.add); + + await worker.start(); + + final stem = Stem( + broker: broker, + registry: registry, + backend: backend, + signer: producerSigner, + ); + final taskId = await stem.enqueue('tasks.flaky'); + + await _waitFor( + () => events.any( + (e) => + e.type == WorkerEventType.completed && e.envelope?.id == taskId, + ), + ); + await _waitFor( + () => events.any( + (e) => e.type == WorkerEventType.retried && e.envelope?.id == taskId, + ), + ); + + final status = await backend.get(taskId); + expect(status?.state, TaskState.succeeded); + expect(status?.attempt, equals(1)); + + final deadPage = await broker.listDeadLetters('default'); + expect(deadPage.entries, isEmpty); + + await sub.cancel(); + await worker.shutdown(); + broker.dispose(); + }); + test('moves task to dead letter after max retries', () async { StemSignals.configure(configuration: const StemSignalConfiguration()); diff --git a/packages/stem/test/unit/workflow/workflow_metadata_views_test.dart b/packages/stem/test/unit/workflow/workflow_metadata_views_test.dart new file mode 100644 index 00000000..4cf11f9e --- /dev/null +++ b/packages/stem/test/unit/workflow/workflow_metadata_views_test.dart @@ -0,0 +1,110 @@ +import 'package:stem/stem.dart'; +import 'package:test/test.dart'; + +void main() { + group('RunState metadata getters', () { + test('exposes suspension metadata fields', () { + final state = RunState( + id: 'run-1', + workflow: 'invoice', + status: WorkflowStatus.suspended, + cursor: 2, + params: const {'tenant': 'acme'}, + createdAt: DateTime.utc(2026, 2, 25), + waitTopic: 'invoice.approved', + suspensionData: const { + 'type': 'event', + 'step': 'awaitApproval', + 'iteration': 3, + 'iterationStep': 'approval#3', + 'topic': 'invoice.approved', + 'suspendedAt': '2026-02-25T00:00:10Z', + 'requestedResumeAt': '2026-02-25T00:05:00Z', + 'policyDeadline': '2026-02-25T00:10:00Z', + 'payload': {'invoiceId': 'inv-1'}, + 'deliveredAt': '2026-02-25T00:03:00Z', + }, + ); + + expect(state.isSuspended, isTrue); + expect(state.suspensionType, equals('event')); + expect(state.suspensionStep, equals('awaitApproval')); + expect(state.suspensionIteration, equals(3)); + expect(state.suspensionIterationStep, equals('approval#3')); + expect(state.waitEventTopic, equals('invoice.approved')); + expect(state.suspendedAt, equals(DateTime.utc(2026, 2, 25, 0, 0, 10))); + expect( + state.requestedResumeAt, + equals(DateTime.utc(2026, 2, 25, 0, 5)), + ); + expect( + state.suspensionPolicyDeadline, + equals(DateTime.utc(2026, 2, 25, 0, 10)), + ); + expect( + state.suspensionDeliveredAt, + equals(DateTime.utc(2026, 2, 25, 0, 3)), + ); + expect( + state.suspensionPayload, + equals(const {'invoiceId': 'inv-1'}), + ); + }); + }); + + group('Workflow watcher metadata getters', () { + test('exposes watcher and resolution metadata', () { + final watcher = WorkflowWatcher( + runId: 'run-1', + stepName: 'awaitApproval', + topic: 'invoice.approved', + createdAt: DateTime.utc(2026, 2, 25), + deadline: DateTime.utc(2026, 2, 25, 0, 15), + data: const { + 'type': 'event', + 'iteration': 2, + 'iterationStep': 'approval#2', + 'payload': {'invoiceId': 'inv-1'}, + 'suspendedAt': '2026-02-25T00:01:00Z', + 'requestedResumeAt': '2026-02-25T00:02:00Z', + 'policyDeadline': '2026-02-25T00:15:00Z', + }, + ); + final resolution = WorkflowWatcherResolution( + runId: 'run-1', + stepName: 'awaitApproval', + topic: 'invoice.approved', + resumeData: const { + 'type': 'event', + 'iteration': 2, + 'iterationStep': 'approval#2', + 'payload': {'invoiceId': 'inv-1'}, + 'deliveredAt': '2026-02-25T00:01:30Z', + }, + ); + + expect(watcher.suspensionType, equals('event')); + expect(watcher.iteration, equals(2)); + expect(watcher.iterationStep, equals('approval#2')); + expect(watcher.payload, equals(const {'invoiceId': 'inv-1'})); + expect(watcher.suspendedAt, equals(DateTime.utc(2026, 2, 25, 0, 1))); + expect( + watcher.requestedResumeAt, + equals(DateTime.utc(2026, 2, 25, 0, 2)), + ); + expect( + watcher.policyDeadline, + equals(DateTime.utc(2026, 2, 25, 0, 15)), + ); + + expect(resolution.suspensionType, equals('event')); + expect(resolution.iteration, equals(2)); + expect(resolution.iterationStep, equals('approval#2')); + expect(resolution.payload, equals(const {'invoiceId': 'inv-1'})); + expect( + resolution.deliveredAt, + equals(DateTime.utc(2026, 2, 25, 0, 1, 30)), + ); + }); + }); +} diff --git a/packages/stem_cli/pubspec.yaml b/packages/stem_cli/pubspec.yaml index 132ded4b..6740cfe5 100644 --- a/packages/stem_cli/pubspec.yaml +++ b/packages/stem_cli/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ">=3.9.2 <4.0.0" dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 stem: ^0.1.0 stem_redis: ^0.1.0 stem_postgres: ^0.1.0 diff --git a/packages/stem_cli/test/integration/cli/cli_health_integration_test.dart b/packages/stem_cli/test/integration/cli/cli_health_integration_test.dart index 54ac3324..155415e8 100644 --- a/packages/stem_cli/test/integration/cli/cli_health_integration_test.dart +++ b/packages/stem_cli/test/integration/cli/cli_health_integration_test.dart @@ -41,10 +41,13 @@ void main() { ); expect(exitCode, 0); - final output = stdoutBuffer.toString(); - expect(output.toLowerCase(), contains('[ok]')); - expect(output, contains('broker: Connected to $redisUrl')); - expect(output, contains('backend: Connected to $postgresUrl')); + final output = stdoutBuffer.toString().toLowerCase(); + expect(output, contains('health checks')); + expect(output, contains('broker: connected to ${redisUrl.toLowerCase()}')); + expect( + output, + contains('backend: connected to ${postgresUrl.toLowerCase()}'), + ); expect(stderrBuffer.isEmpty, isTrue); }); } diff --git a/packages/stem_postgres/pubspec.yaml b/packages/stem_postgres/pubspec.yaml index 7425ac2b..5cc37d18 100644 --- a/packages/stem_postgres/pubspec.yaml +++ b/packages/stem_postgres/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ">=3.9.2 <4.0.0" dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 collection: ^1.19.1 contextual: ^2.2.0 ormed: ^0.1.0 diff --git a/packages/stem_sqlite/pubspec.yaml b/packages/stem_sqlite/pubspec.yaml index d032ec9a..2b00ebd6 100644 --- a/packages/stem_sqlite/pubspec.yaml +++ b/packages/stem_sqlite/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ^3.9.2 dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 collection: ^1.19.1 contextual: ^2.2.0 meta: ^1.18.0 diff --git a/pubspec.yaml b/pubspec.yaml index d1dc512f..218878ae 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -4,6 +4,7 @@ environment: sdk: ">=3.9.2 <4.0.0" workspace: - packages/stem + - packages/dashboard - packages/stem_builder - packages/stem_sqlite - packages/stem_memory