From 35e8392049cafdb48de0411d9134fbfd6d5d317a Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Tue, 24 Feb 2026 17:08:29 -0500 Subject: [PATCH 01/14] feat: refactor dashboard content structure and move page rendering logic to dedicated files - Moved event rendering logic to `events.dart` and created `buildEventsContent` function. - Created `options.dart` for task and worker page options classes. - Moved overview content rendering to `overview.dart` with `buildOverviewContent` function. - Added shared utility functions to `shared.dart` for metric cards and queue table rows. - Implemented task rendering in `tasks.dart` with `buildTasksContent` function. - Created worker rendering logic in `workers.dart` with `buildWorkersContent` function. --- packages/dashboard/lib/src/ui/content.dart | 583 +------------------- packages/dashboard/lib/src/ui/events.dart | 33 ++ packages/dashboard/lib/src/ui/options.dart | 53 ++ packages/dashboard/lib/src/ui/overview.dart | 70 +++ packages/dashboard/lib/src/ui/shared.dart | 85 +++ packages/dashboard/lib/src/ui/tasks.dart | 164 ++++++ packages/dashboard/lib/src/ui/workers.dart | 190 +++++++ 7 files changed, 605 insertions(+), 573 deletions(-) create mode 100644 packages/dashboard/lib/src/ui/events.dart create mode 100644 packages/dashboard/lib/src/ui/options.dart create mode 100644 packages/dashboard/lib/src/ui/overview.dart create mode 100644 packages/dashboard/lib/src/ui/shared.dart create mode 100644 packages/dashboard/lib/src/ui/tasks.dart create mode 100644 packages/dashboard/lib/src/ui/workers.dart diff --git a/packages/dashboard/lib/src/ui/content.dart b/packages/dashboard/lib/src/ui/content.dart index b5f3ba7..8ed220b 100644 --- a/packages/dashboard/lib/src/ui/content.dart +++ b/packages/dashboard/lib/src/ui/content.dart @@ -1,67 +1,12 @@ -import 'package:intl/intl.dart'; -import 'package:stem/stem.dart' show stemNow; -// HTML template strings are kept on single lines for readability. 
-// ignore_for_file: lines_longer_than_80_chars - import 'package:stem_dashboard/src/services/models.dart'; -import 'package:stem_dashboard/src/ui/event_templates.dart'; +import 'package:stem_dashboard/src/ui/events.dart'; import 'package:stem_dashboard/src/ui/layout.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/overview.dart'; +import 'package:stem_dashboard/src/ui/tasks.dart'; +import 'package:stem_dashboard/src/ui/workers.dart'; -final _numberFormat = NumberFormat.decimalPattern(); - -/// View options used by the tasks page renderer. -class TasksPageOptions { - /// Creates task page options with optional overrides. - const TasksPageOptions({ - this.sortKey = 'queue', - this.descending = false, - this.filter, - this.flashKey, - this.errorKey, - }); - - /// Sort key used for queue ordering. - final String sortKey; - - /// Whether sorting should be descending. - final bool descending; - - /// Optional queue filter text. - final String? filter; - - /// Optional flash message key for UI alerts. - final String? flashKey; - - /// Optional error message key for UI alerts. - final String? errorKey; - - /// Whether a non-empty filter value is set. - bool get hasFilter => filter != null && filter!.isNotEmpty; -} - -/// View options used by the workers page renderer. -class WorkersPageOptions { - /// Creates worker page options with optional overrides. - const WorkersPageOptions({this.flashMessage, this.errorMessage, this.scope}); - - /// Optional flash message for the UI. - final String? flashMessage; - - /// Optional error message for the UI. - final String? errorMessage; - - /// Optional worker scope filter. - final String? scope; - - /// Whether a non-empty flash message is set. - bool get hasFlash => flashMessage != null && flashMessage!.isNotEmpty; - - /// Whether a non-empty error message is set. 
- bool get hasError => errorMessage != null && errorMessage!.isNotEmpty; - - /// Whether a non-empty scope value is set. - bool get hasScope => scope != null && scope!.isNotEmpty; -} +export 'package:stem_dashboard/src/ui/options.dart'; /// Builds the HTML for the specified dashboard [page]. String buildPageContent({ @@ -75,520 +20,12 @@ String buildPageContent({ }) { switch (page) { case DashboardPage.overview: - return _overviewContent(queues, workers, throughput); + return buildOverviewContent(queues, workers, throughput); case DashboardPage.tasks: - return _tasksContent(queues, tasksOptions); + return buildTasksContent(queues, tasksOptions); case DashboardPage.events: - return _eventsContent(events); + return buildEventsContent(events); case DashboardPage.workers: - return _workersContent(workers, queues, workersOptions); - } -} - -String _overviewContent( - List queues, - List workers, - DashboardThroughput? throughput, -) { - final totalPending = queues.fold( - 0, - (total, summary) => total + summary.pending, - ); - final totalInflight = queues.fold( - 0, - (total, summary) => total + summary.inflight, - ); - final totalDead = queues.fold( - 0, - (total, summary) => total + summary.deadLetters, - ); - final activeWorkers = workers.length; - final busiest = List.of( - queues, - )..sort((a, b) => (b.pending + b.inflight).compareTo(a.pending + a.inflight)); - final topQueues = busiest.take(5).toList(); - - final processedPerMin = throughput?.processedPerMinute ?? 0; - final enqueuedPerMin = throughput?.enqueuedPerMinute ?? 0; - final throughputHint = throughput == null - ? 'Waiting for another snapshot to estimate rate.' - : 'Net change over the last ${throughput.interval.inSeconds}s.'; - - return ''' - - -
- ${_metricCard('Backlog (lag)', _formatInt(totalPending), 'Undelivered tasks waiting across all queues.')} - ${_metricCard('Processing', _formatInt(totalInflight), 'Active envelopes currently being executed.')} - ${_metricCard('Processed / min', _formatRate(processedPerMin), throughputHint)} - ${_metricCard('Enqueued / min', _formatRate(enqueuedPerMin), throughputHint)} - ${_metricCard('Dead letters', _formatInt(totalDead), 'Items held in dead letter queues.')} - ${_metricCard('Active workers', _formatInt(activeWorkers), 'Workers that published heartbeats within the retention window.')} -
- -
- - - - - - - - - - - ${topQueues.isEmpty ? _emptyQueuesRow('No queues detected yet.') : topQueues.map(_queueTableRow).join()} - -
QueuePendingIn-flightDead letters
-
-'''; -} - -String _tasksContent(List queues, TasksPageOptions options) { - var filtered = - options.hasFilter - ? queues - .where( - (summary) => summary.queue.toLowerCase().contains( - options.filter!.toLowerCase(), - ), - ) - .toList() - : List.of(queues) - ..sort((a, b) => _compareQueues(a, b, options)); - if (options.descending) { - filtered = filtered.reversed.toList(); - } - - final totalQueues = filtered.length; - final dlqTotal = filtered.fold( - 0, - (total, summary) => total + summary.deadLetters, - ); - - return ''' - - -${_renderTasksAlert(options)} - -
- ${_metricCard('Tracked queues', _formatInt(totalQueues), 'Queues discovered via Redis stream prefixes.')} - ${_metricCard('Dead letter size', _formatInt(dlqTotal), 'Aggregate items across all dead letter queues.')} -
- -
- - - - - - ${options.hasFilter ? 'Clear' : ''} -
- -
- - - - - - - - - - - ${filtered.isEmpty ? _emptyQueuesRow('No streams found for the configured namespace.') : filtered.map(_queueTableRow).join()} - -
${_sortableHeader('Queue', 'queue', options)}${_sortableHeader('Pending', 'pending', options)}${_sortableHeader('In-flight', 'inflight', options)}${_sortableHeader('Dead letters', 'dead', options)}
-
- -
-
-

Ad-hoc enqueue

-
-
- - - - - -
- -
-
-
-'''; -} - -String _formatRate(double value) { - if (value <= 0) return '0'; - if (value < 1) return value.toStringAsFixed(2); - return _numberFormat.format(value.round()); -} - -String _eventsContent(List events) { - final items = events.isEmpty - ? ''' -
-

No events captured yet

-

- Configure the dashboard event bridge to stream Stem signals (enqueue, start, retry, completion) into Redis. - Once connected, updates will appear here automatically via Turbo Streams. -

-
- ''' - : events.map(renderEventItem).join(); - - return ''' - - -
- $items -
-'''; -} - -String _workersContent( - List workers, - List queues, - WorkersPageOptions options, -) { - final healthyWorkers = workers.where((worker) { - return worker.age <= const Duration(minutes: 2); - }).length; - - final busy = workers.where((worker) => worker.inflight > 0).length; - final queueMap = {for (final summary in queues) summary.queue: summary}; - - return ''' - - -${_renderWorkersAlert(options)} - -
- ${_metricCard('Healthy workers', _formatInt(healthyWorkers), 'Heartbeats received within the last two minutes.')} - ${_metricCard('Busy workers', _formatInt(busy), 'Workers currently processing at least one task.')} - ${_metricCard('Isolates in use', _formatInt(_totalIsolates(workers)), 'Sum of worker isolates across the cluster.')} -
- -
- - - - - - - - - - - - ${workers.isEmpty ? ''' - - - - ''' : workers.map(_workerRow).join()} - -
WorkerQueuesInflightLast heartbeatActions
No heartbeats detected for namespace "${workers.isEmpty ? 'stem' : workers.first.namespace}".
-
- -${_clusterControls()} - -${_queueRecoverySection(queueMap)} -'''; -} - -String _queueTableRow(QueueSummary summary) { - return ''' - - ${summary.queue} - ${_formatInt(summary.pending)} - ${_formatInt(summary.inflight)} - ${_formatInt(summary.deadLetters)} - - - -
-
Pending ${_formatInt(summary.pending)}
-
In-flight ${_formatInt(summary.inflight)}
-
Dead letters ${_formatInt(summary.deadLetters)}
-
Detailed DLQ previews render here once the replay control is wired.
-
- - -'''; -} - -String _workerRow(WorkerStatus status) { - final queues = status.queues.isEmpty - ? '' - : status.queues - .map((queue) => '${queue.name}') - .join(' '); - return ''' - - ${status.workerId} - $queues - ${_formatInt(status.inflight)} - ${_formatRelative(status.timestamp)} - -
- ${_workerActionButton('Ping', 'ping', status.workerId)} - ${_workerActionButton('Pause', 'pause', status.workerId)} - ${_workerActionButton('Shutdown', 'shutdown', status.workerId)} -
- - -'''; -} - -String _workerActionButton(String label, String action, String workerId) { - return ''' -
- - - -
-'''; -} - -String _clusterControls() { - return ''' -
-

Cluster controls

-
- ${_clusterActionButton('Ping all workers', 'ping')} - ${_clusterActionButton('Pause all workers', 'pause')} - ${_clusterActionButton('Shutdown all workers', 'shutdown')} -
-
-'''; -} - -String _clusterActionButton(String label, String action) { - return ''' -
- - - -
-'''; -} - -String _queueRecoverySection(Map queues) { - if (queues.isEmpty) return ''; - final rows = queues.values.toList() - ..sort((a, b) => a.queue.compareTo(b.queue)); - return ''' -
- - - - - - - - - - - ${rows.map(_queueRecoveryRow).join()} - -
QueuePendingDead lettersReplay
-
-'''; -} - -String _queueRecoveryRow(QueueSummary summary) { - final limitDefault = summary.deadLetters <= 0 - ? 50 - : summary.deadLetters.clamp(1, 50); - final action = summary.deadLetters == 0 - ? 'No dead letters' - : ''' -
- - - -
- '''; - return ''' - - ${_escapeHtml(summary.queue)} - ${_formatInt(summary.pending)} - ${_formatInt(summary.deadLetters)} - $action - -'''; -} - -String _metricCard(String title, String value, String caption) { - return ''' -
-
$title
-
$value
-

$caption

-
-'''; -} - -String _emptyQueuesRow(String message) { - return ''' - - $message - -'''; -} - -String _renderTasksAlert(TasksPageOptions options) { - String? message; - var type = 'success'; - switch (options.flashKey) { - case 'queued': - message = 'Task enqueued successfully.'; - } - switch (options.errorKey) { - case 'missing-fields': - message = 'Queue and task name are required.'; - type = 'error'; - case 'invalid-payload': - message = 'Payload must be valid JSON describing an object.'; - type = 'error'; - case 'enqueue-failed': - message = - 'Failed to enqueue the task. Check the dashboard logs for details.'; - type = 'error'; - } - - if (message == null) return ''; - return '
${_escapeHtml(message)}
'; -} - -String _renderWorkersAlert(WorkersPageOptions options) { - if (options.hasError) { - final scope = options.hasScope - ? '
Target: ${_escapeHtml(options.scope!)}.
' - : ''; - return ''' -
- ${_escapeHtml(options.errorMessage!)} - $scope -
-'''; - } - if (options.hasFlash) { - final scope = options.hasScope - ? '
Target: ${_escapeHtml(options.scope!)}.
' - : ''; - return ''' -
- ${_escapeHtml(options.flashMessage!)} - $scope -
-'''; - } - return ''; -} - -int _compareQueues(QueueSummary a, QueueSummary b, TasksPageOptions options) { - switch (options.sortKey) { - case 'pending': - return a.pending.compareTo(b.pending); - case 'inflight': - return a.inflight.compareTo(b.inflight); - case 'dead': - return a.deadLetters.compareTo(b.deadLetters); - case 'queue': - default: - return a.queue.toLowerCase().compareTo(b.queue.toLowerCase()); - } -} - -String _sortableHeader(String label, String key, TasksPageOptions options) { - final isActive = options.sortKey == key; - final descendingNext = isActive ? !options.descending : key != 'queue'; - final params = { - 'sort': key, - 'direction': descendingNext ? 'desc' : 'asc', - }; - if (options.hasFilter) { - params['queue'] = options.filter!; - } - final query = _buildQuery(params); - final indicator = isActive ? (options.descending ? '↓' : '↑') : ''; - final classes = isActive ? 'sort-link active' : 'sort-link'; - return '$label $indicator'; -} - -String _buildQuery(Map params) { - return params.entries - .map( - (entry) => - '${Uri.encodeQueryComponent(entry.key)}=${Uri.encodeQueryComponent(entry.value)}', - ) - .join('&'); -} - -String _escapeHtml(String value) { - return value - .replaceAll('&', '&') - .replaceAll('<', '<') - .replaceAll('>', '>') - .replaceAll('"', '"') - .replaceAll("'", '''); -} - -int _totalIsolates(List workers) { - return workers.fold(0, (total, status) => total + status.isolateCount); -} - -String _formatInt(int value) => _numberFormat.format(value); - -String _formatRelative(DateTime timestamp) { - final now = stemNow().toUtc(); - final diff = now.difference(timestamp.toUtc()); - if (diff < const Duration(seconds: 30)) return 'just now'; - if (diff < const Duration(minutes: 1)) { - return '${diff.inSeconds}s ago'; - } - if (diff < const Duration(hours: 1)) { - return '${diff.inMinutes}m ago'; - } - if (diff < const Duration(days: 1)) { - return '${diff.inHours}h ago'; + return buildWorkersContent(workers, queues, 
workersOptions); } - return '${diff.inDays}d ago'; } diff --git a/packages/dashboard/lib/src/ui/events.dart b/packages/dashboard/lib/src/ui/events.dart new file mode 100644 index 0000000..98b0e16 --- /dev/null +++ b/packages/dashboard/lib/src/ui/events.dart @@ -0,0 +1,33 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/event_templates.dart'; + +String buildEventsContent(List events) { + final items = events.isEmpty + ? ''' +
+

No events captured yet

+

+ Configure the dashboard event bridge to stream Stem signals (enqueue, start, retry, completion) into Redis. + Once connected, updates will appear here automatically via Turbo Streams. +

+
+ ''' + : events.map(renderEventItem).join(); + + return ''' + + +
+ $items +
+'''; +} diff --git a/packages/dashboard/lib/src/ui/options.dart b/packages/dashboard/lib/src/ui/options.dart new file mode 100644 index 0000000..de523c5 --- /dev/null +++ b/packages/dashboard/lib/src/ui/options.dart @@ -0,0 +1,53 @@ +/// View options used by the tasks page renderer. +class TasksPageOptions { + /// Creates task page options with optional overrides. + const TasksPageOptions({ + this.sortKey = 'queue', + this.descending = false, + this.filter, + this.flashKey, + this.errorKey, + }); + + /// Sort key used for queue ordering. + final String sortKey; + + /// Whether sorting should be descending. + final bool descending; + + /// Optional queue filter text. + final String? filter; + + /// Optional flash message key for UI alerts. + final String? flashKey; + + /// Optional error message key for UI alerts. + final String? errorKey; + + /// Whether a non-empty filter value is set. + bool get hasFilter => filter != null && filter!.isNotEmpty; +} + +/// View options used by the workers page renderer. +class WorkersPageOptions { + /// Creates worker page options with optional overrides. + const WorkersPageOptions({this.flashMessage, this.errorMessage, this.scope}); + + /// Optional flash message for the UI. + final String? flashMessage; + + /// Optional error message for the UI. + final String? errorMessage; + + /// Optional worker scope filter. + final String? scope; + + /// Whether a non-empty flash message is set. + bool get hasFlash => flashMessage != null && flashMessage!.isNotEmpty; + + /// Whether a non-empty error message is set. + bool get hasError => errorMessage != null && errorMessage!.isNotEmpty; + + /// Whether a non-empty scope value is set. 
+ bool get hasScope => scope != null && scope!.isNotEmpty; +} diff --git a/packages/dashboard/lib/src/ui/overview.dart b/packages/dashboard/lib/src/ui/overview.dart new file mode 100644 index 0000000..0926312 --- /dev/null +++ b/packages/dashboard/lib/src/ui/overview.dart @@ -0,0 +1,70 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildOverviewContent( + List queues, + List workers, + DashboardThroughput? throughput, +) { + final totalPending = queues.fold( + 0, + (total, summary) => total + summary.pending, + ); + final totalInflight = queues.fold( + 0, + (total, summary) => total + summary.inflight, + ); + final totalDead = queues.fold( + 0, + (total, summary) => total + summary.deadLetters, + ); + final activeWorkers = workers.length; + final busiest = List.of( + queues, + )..sort((a, b) => (b.pending + b.inflight).compareTo(a.pending + a.inflight)); + final topQueues = busiest.take(5).toList(); + + final processedPerMin = throughput?.processedPerMinute ?? 0; + final enqueuedPerMin = throughput?.enqueuedPerMinute ?? 0; + final throughputHint = throughput == null + ? 'Waiting for another snapshot to estimate rate.' + : 'Net change over the last ${throughput.interval.inSeconds}s.'; + + return ''' + + +
+ ${buildMetricCard('Backlog (lag)', formatInt(totalPending), 'Undelivered tasks waiting across all queues.')} + ${buildMetricCard('Processing', formatInt(totalInflight), 'Active envelopes currently being executed.')} + ${buildMetricCard('Processed / min', formatRate(processedPerMin), throughputHint)} + ${buildMetricCard('Enqueued / min', formatRate(enqueuedPerMin), throughputHint)} + ${buildMetricCard('Dead letters', formatInt(totalDead), 'Items held in dead letter queues.')} + ${buildMetricCard('Active workers', formatInt(activeWorkers), 'Workers that published heartbeats within the retention window.')} +
+ +
+ + + + + + + + + + + ${topQueues.isEmpty ? buildEmptyQueuesRow('No queues detected yet.') : topQueues.map(buildQueueTableRow).join()} + +
QueuePendingIn-flightDead letters
+
+'''; +} diff --git a/packages/dashboard/lib/src/ui/shared.dart b/packages/dashboard/lib/src/ui/shared.dart new file mode 100644 index 0000000..8b9cddb --- /dev/null +++ b/packages/dashboard/lib/src/ui/shared.dart @@ -0,0 +1,85 @@ +import 'package:intl/intl.dart'; +import 'package:stem/stem.dart' show stemNow; +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; + +final dashboardNumberFormat = NumberFormat.decimalPattern(); + +String buildQueueTableRow(QueueSummary summary) { + return ''' + + ${summary.queue} + ${formatInt(summary.pending)} + ${formatInt(summary.inflight)} + ${formatInt(summary.deadLetters)} + + + +
+
Pending ${formatInt(summary.pending)}
+
In-flight ${formatInt(summary.inflight)}
+
Dead letters ${formatInt(summary.deadLetters)}
+
Detailed DLQ previews render here once the replay control is wired.
+
+ + +'''; +} + +String buildMetricCard(String title, String value, String caption) { + return ''' +
+
$title
+
$value
+

$caption

+
+'''; +} + +String buildEmptyQueuesRow(String message) { + return ''' + + $message + +'''; +} + +String escapeHtml(String value) { + return value + .replaceAll('&', '&') + .replaceAll('<', '<') + .replaceAll('>', '>') + .replaceAll('"', '"') + .replaceAll("'", '''); +} + +int totalIsolates(List workers) { + return workers.fold(0, (total, status) => total + status.isolateCount); +} + +String formatInt(int value) => dashboardNumberFormat.format(value); + +String formatRate(double value) { + if (value <= 0) return '0'; + if (value < 1) return value.toStringAsFixed(2); + return dashboardNumberFormat.format(value.round()); +} + +String formatRelative(DateTime timestamp) { + final now = stemNow().toUtc(); + final diff = now.difference(timestamp.toUtc()); + if (diff < const Duration(seconds: 30)) return 'just now'; + if (diff < const Duration(minutes: 1)) { + return '${diff.inSeconds}s ago'; + } + if (diff < const Duration(hours: 1)) { + return '${diff.inMinutes}m ago'; + } + if (diff < const Duration(days: 1)) { + return '${diff.inHours}h ago'; + } + return '${diff.inDays}d ago'; +} diff --git a/packages/dashboard/lib/src/ui/tasks.dart b/packages/dashboard/lib/src/ui/tasks.dart new file mode 100644 index 0000000..b535768 --- /dev/null +++ b/packages/dashboard/lib/src/ui/tasks.dart @@ -0,0 +1,164 @@ +// HTML template strings are kept on single lines for readability. +// ignore_for_file: lines_longer_than_80_chars, public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildTasksContent(List queues, TasksPageOptions options) { + var filtered = + options.hasFilter + ? 
queues + .where( + (summary) => summary.queue.toLowerCase().contains( + options.filter!.toLowerCase(), + ), + ) + .toList() + : List.of(queues) + ..sort((a, b) => compareQueues(a, b, options)); + if (options.descending) { + filtered = filtered.reversed.toList(); + } + + final totalQueues = filtered.length; + final dlqTotal = filtered.fold( + 0, + (total, summary) => total + summary.deadLetters, + ); + + return ''' + + +${renderTasksAlert(options)} + +
+ ${buildMetricCard('Tracked queues', formatInt(totalQueues), 'Queues discovered via Redis stream prefixes.')} + ${buildMetricCard('Dead letter size', formatInt(dlqTotal), 'Aggregate items across all dead letter queues.')} +
+ +
+ + + + + + ${options.hasFilter ? 'Clear' : ''} +
+ +
+ + + + + + + + + + + ${filtered.isEmpty ? buildEmptyQueuesRow('No streams found for the configured namespace.') : filtered.map(buildQueueTableRow).join()} + +
${buildSortableHeader('Queue', 'queue', options)}${buildSortableHeader('Pending', 'pending', options)}${buildSortableHeader('In-flight', 'inflight', options)}${buildSortableHeader('Dead letters', 'dead', options)}
+
+ +
+
+

Ad-hoc enqueue

+
+
+ + + + + +
+ +
+
+
+'''; +} + +String renderTasksAlert(TasksPageOptions options) { + String? message; + var type = 'success'; + switch (options.flashKey) { + case 'queued': + message = 'Task enqueued successfully.'; + } + switch (options.errorKey) { + case 'missing-fields': + message = 'Queue and task name are required.'; + type = 'error'; + case 'invalid-payload': + message = 'Payload must be valid JSON describing an object.'; + type = 'error'; + case 'enqueue-failed': + message = + 'Failed to enqueue the task. Check the dashboard logs for details.'; + type = 'error'; + } + + if (message == null) return ''; + return '
${escapeHtml(message)}
'; +} + +int compareQueues(QueueSummary a, QueueSummary b, TasksPageOptions options) { + switch (options.sortKey) { + case 'pending': + return a.pending.compareTo(b.pending); + case 'inflight': + return a.inflight.compareTo(b.inflight); + case 'dead': + return a.deadLetters.compareTo(b.deadLetters); + case 'queue': + default: + return a.queue.toLowerCase().compareTo(b.queue.toLowerCase()); + } +} + +String buildSortableHeader(String label, String key, TasksPageOptions options) { + final isActive = options.sortKey == key; + final descendingNext = isActive ? !options.descending : key != 'queue'; + final params = { + 'sort': key, + 'direction': descendingNext ? 'desc' : 'asc', + }; + if (options.hasFilter) { + params['queue'] = options.filter!; + } + final query = buildQuery(params); + final indicator = isActive ? (options.descending ? '↓' : '↑') : ''; + final classes = isActive ? 'sort-link active' : 'sort-link'; + return '$label $indicator'; +} + +String buildQuery(Map params) { + return params.entries + .map( + (entry) => + '${Uri.encodeQueryComponent(entry.key)}=${Uri.encodeQueryComponent(entry.value)}', + ) + .join('&'); +} diff --git a/packages/dashboard/lib/src/ui/workers.dart b/packages/dashboard/lib/src/ui/workers.dart new file mode 100644 index 0000000..2f4a840 --- /dev/null +++ b/packages/dashboard/lib/src/ui/workers.dart @@ -0,0 +1,190 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. 
+// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/options.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildWorkersContent( + List workers, + List queues, + WorkersPageOptions options, +) { + final healthyWorkers = workers.where((worker) { + return worker.age <= const Duration(minutes: 2); + }).length; + + final busy = workers.where((worker) => worker.inflight > 0).length; + final queueMap = {for (final summary in queues) summary.queue: summary}; + + return ''' + + +${renderWorkersAlert(options)} + +
+ ${buildMetricCard('Healthy workers', formatInt(healthyWorkers), 'Heartbeats received within the last two minutes.')} + ${buildMetricCard('Busy workers', formatInt(busy), 'Workers currently processing at least one task.')} + ${buildMetricCard('Isolates in use', formatInt(totalIsolates(workers)), 'Sum of worker isolates across the cluster.')} +
+ +
+ + + + + + + + + + + + ${workers.isEmpty ? ''' + + + + ''' : workers.map(buildWorkerRow).join()} + +
WorkerQueuesInflightLast heartbeatActions
No heartbeats detected for namespace "${workers.isEmpty ? 'stem' : workers.first.namespace}".
+
+ +${buildClusterControls()} + +${buildQueueRecoverySection(queueMap)} +'''; +} + +String buildWorkerRow(WorkerStatus status) { + final queues = status.queues.isEmpty + ? '' + : status.queues + .map((queue) => '${queue.name}') + .join(' '); + return ''' + + ${status.workerId} + $queues + ${formatInt(status.inflight)} + ${formatRelative(status.timestamp)} + +
+ ${buildWorkerActionButton('Ping', 'ping', status.workerId)} + ${buildWorkerActionButton('Pause', 'pause', status.workerId)} + ${buildWorkerActionButton('Shutdown', 'shutdown', status.workerId)} +
+ + +'''; +} + +String buildWorkerActionButton(String label, String action, String workerId) { + return ''' +
+ + + +
+'''; +} + +String buildClusterControls() { + return ''' +
+

Cluster controls

+
+ ${buildClusterActionButton('Ping all workers', 'ping')} + ${buildClusterActionButton('Pause all workers', 'pause')} + ${buildClusterActionButton('Shutdown all workers', 'shutdown')} +
+
+'''; +} + +String buildClusterActionButton(String label, String action) { + return ''' +
+ + + +
+'''; +} + +String buildQueueRecoverySection(Map queues) { + if (queues.isEmpty) return ''; + final rows = queues.values.toList() + ..sort((a, b) => a.queue.compareTo(b.queue)); + return ''' +
+ + + + + + + + + + + ${rows.map(buildQueueRecoveryRow).join()} + +
QueuePendingDead lettersReplay
+
+'''; +} + +String buildQueueRecoveryRow(QueueSummary summary) { + final limitDefault = summary.deadLetters <= 0 + ? 50 + : summary.deadLetters.clamp(1, 50); + final action = summary.deadLetters == 0 + ? 'No dead letters' + : ''' +
+ + + +
+ '''; + return ''' + + ${escapeHtml(summary.queue)} + ${formatInt(summary.pending)} + ${formatInt(summary.deadLetters)} + $action + +'''; +} + +String renderWorkersAlert(WorkersPageOptions options) { + if (options.hasError) { + final scope = options.hasScope + ? '
Target: ${escapeHtml(options.scope!)}.
' + : ''; + return ''' +
+ ${escapeHtml(options.errorMessage!)} + $scope +
+'''; + } + if (options.hasFlash) { + final scope = options.hasScope + ? '
Target: ${escapeHtml(options.scope!)}.
' + : ''; + return ''' +
+ ${escapeHtml(options.flashMessage!)} + $scope +
+'''; + } + return ''; +} From a72f7a975c5ecb0f3d06a62f9d006c1d1c2d6140 Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Tue, 24 Feb 2026 17:09:16 -0500 Subject: [PATCH 02/14] chore: update artisanal dependency to version 0.2.0 in multiple packages and add dashboard to workspace --- packages/dashboard/pubspec.yaml | 14 +++----------- packages/stem_cli/pubspec.yaml | 2 +- packages/stem_postgres/pubspec.yaml | 2 +- packages/stem_sqlite/pubspec.yaml | 2 +- pubspec.yaml | 1 + 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/packages/dashboard/pubspec.yaml b/packages/dashboard/pubspec.yaml index d702f2c..083a814 100644 --- a/packages/dashboard/pubspec.yaml +++ b/packages/dashboard/pubspec.yaml @@ -5,6 +5,7 @@ publish_to: "none" environment: sdk: ">=3.9.2 <4.0.0" +resolution: workspace dependencies: intl: ^0.20.2 meta: ^1.18.0 @@ -27,14 +28,5 @@ dev_dependencies: dependency_overrides: analyzer: ^10.0.1 - stem: - path: ../stem - stem_cli: - path: ../stem_cli - stem_postgres: - path: ../stem_postgres - stem_redis: - path: ../stem_redis - stem_sqlite: - path: ../stem_sqlite - timezone: 0.11.0 + artisanal: ^0.2.0 + diff --git a/packages/stem_cli/pubspec.yaml b/packages/stem_cli/pubspec.yaml index 132ded4..6740cfe 100644 --- a/packages/stem_cli/pubspec.yaml +++ b/packages/stem_cli/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ">=3.9.2 <4.0.0" dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 stem: ^0.1.0 stem_redis: ^0.1.0 stem_postgres: ^0.1.0 diff --git a/packages/stem_postgres/pubspec.yaml b/packages/stem_postgres/pubspec.yaml index 7425ac2..5cc37d1 100644 --- a/packages/stem_postgres/pubspec.yaml +++ b/packages/stem_postgres/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ">=3.9.2 <4.0.0" dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 collection: ^1.19.1 contextual: ^2.2.0 ormed: ^0.1.0 diff --git a/packages/stem_sqlite/pubspec.yaml b/packages/stem_sqlite/pubspec.yaml index d032ec9..2b00ebd 100644 --- a/packages/stem_sqlite/pubspec.yaml +++ 
b/packages/stem_sqlite/pubspec.yaml @@ -7,7 +7,7 @@ environment: sdk: ^3.9.2 dependencies: - artisanal: ^0.1.2 + artisanal: ^0.2.0 collection: ^1.19.1 contextual: ^2.2.0 meta: ^1.18.0 diff --git a/pubspec.yaml b/pubspec.yaml index d1dc512..218878a 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -4,6 +4,7 @@ environment: sdk: ">=3.9.2 <4.0.0" workspace: - packages/stem + - packages/dashboard - packages/stem_builder - packages/stem_sqlite - packages/stem_memory From 52827bbfbc1fa8f90ccdd81ff1e9b989a63c8ad9 Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Wed, 25 Feb 2026 10:05:12 -0500 Subject: [PATCH 03/14] feat(worker): enrich span attributes and sign retry republishes --- packages/stem/lib/src/worker/worker.dart | 218 ++++++++++++++++++++--- 1 file changed, 194 insertions(+), 24 deletions(-) diff --git a/packages/stem/lib/src/worker/worker.dart b/packages/stem/lib/src/worker/worker.dart index 1889c5b..d7ccc05 100644 --- a/packages/stem/lib/src/worker/worker.dart +++ b/packages/stem/lib/src/worker/worker.dart @@ -720,9 +720,14 @@ class Worker { final envelope = delivery.envelope; final tracer = StemTracer.instance; final parentContext = tracer.extractTraceContext(envelope.headers); - final spanAttributes = { - 'stem.task': envelope.name, - 'stem.queue': envelope.queue, + final baseSpanAttributes = _deliverySpanAttributes(envelope); + final consumeSpanAttributes = { + ...baseSpanAttributes, + 'stem.span.phase': 'consume', + }; + final executeSpanAttributes = { + ...baseSpanAttributes, + 'stem.span.phase': 'execute', }; await tracer.trace( @@ -987,7 +992,7 @@ class Worker { decodedArgs, ), ), - attributes: spanAttributes, + attributes: executeSpanAttributes, ); _cancelLeaseTimer(delivery.receipt); @@ -1121,7 +1126,7 @@ class Worker { }, context: parentContext, spanKind: dotel.SpanKind.consumer, - attributes: spanAttributes, + attributes: consumeSpanAttributes, ); } @@ -2027,14 +2032,13 @@ class Worker { retryPolicy, ); final nextRunAt = stemNow().add(delay); - await 
broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith( - attempt: envelope.attempt + 1, - maxRetries: maxRetries, - notBefore: stemNow().add(delay), - ), + final retryEnvelope = envelope.copyWith( + attempt: envelope.attempt + 1, + maxRetries: maxRetries, + notBefore: nextRunAt, ); + await broker.nack(delivery, requeue: false); + await _publishWithOptionalSigning(retryEnvelope); final retriedMeta = _statusMeta( envelope, resultEncoder, @@ -2226,15 +2230,14 @@ class Worker { updatedMeta['stem.retryPolicy'] = request.retryPolicy!.toJson(); } - await broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith( - attempt: envelope.attempt + 1, - maxRetries: maxRetries, - notBefore: notBefore, - meta: updatedMeta, - ), + final retryEnvelope = envelope.copyWith( + attempt: envelope.attempt + 1, + maxRetries: maxRetries, + notBefore: notBefore, + meta: updatedMeta, ); + await broker.nack(delivery, requeue: false); + await _publishWithOptionalSigning(retryEnvelope); final retriedMeta = _statusMeta( envelope, @@ -2296,10 +2299,9 @@ class Worker { required Duration backoff, Map extra = const {}, }) async { + final retryEnvelope = envelope.copyWith(notBefore: stemNow().add(backoff)); await broker.nack(delivery, requeue: false); - await broker.publish( - envelope.copyWith(notBefore: stemNow().add(backoff)), - ); + await _publishWithOptionalSigning(retryEnvelope); final data = { ...extra, if (!extra.containsKey('retryAfterMs')) @@ -2320,6 +2322,16 @@ class Worker { ); } + Future _publishWithOptionalSigning(Envelope envelope) async { + final payloadSigner = signer; + if (payloadSigner == null) { + await broker.publish(envelope); + return; + } + final signed = await payloadSigner.sign(envelope); + await broker.publish(signed); + } + /// Requeues deliveries from paused queues without executing handlers. 
Future _handlePausedQueueDelivery( Delivery delivery, @@ -2698,6 +2710,156 @@ class Worker { return {...context, ...traceFields}; } + Map _deliverySpanAttributes(Envelope envelope) { + final attributes = { + 'stem.task': envelope.name, + 'stem.task.id': envelope.id, + 'stem.task.attempt': envelope.attempt, + 'stem.task.max_retries': envelope.maxRetries, + 'stem.task.priority': envelope.priority, + 'stem.queue': envelope.queue, + 'stem.worker.id': _workerIdentifier, + 'stem.worker.namespace': namespace, + }; + + final groupId = envelope.headers['stem-group-id']?.trim(); + if (groupId != null && groupId.isNotEmpty) { + attributes['stem.group.id'] = groupId; + } + + final host = _safeLocalHostname(); + if (host != null) { + attributes['host.name'] = host; + } + + _appendEnvelopeMetaTraceAttributes(attributes, envelope.meta); + return attributes; + } + + void _appendEnvelopeMetaTraceAttributes( + Map attributes, + Map meta, + ) { + final namespaceValue = _metaString(meta, const [ + 'stem.namespace', + 'namespace', + ]); + if (namespaceValue != null) { + attributes['stem.namespace'] = namespaceValue; + } + + final parentTaskId = _metaString(meta, const ['stem.parentTaskId']); + if (parentTaskId != null) { + attributes['stem.parent_task_id'] = parentTaskId; + } + + final rootTaskId = _metaString(meta, const ['stem.rootTaskId']); + if (rootTaskId != null) { + attributes['stem.root_task_id'] = rootTaskId; + } + + final workflowRunId = _metaString(meta, const [ + 'stem.workflow.runId', + 'workflow.runId', + 'stem.workflow.run_id', + ]); + if (workflowRunId != null) { + attributes['stem.workflow.run_id'] = workflowRunId; + } + + final workflowName = _metaString(meta, const [ + 'stem.workflow.name', + 'workflow.name', + ]); + if (workflowName != null) { + attributes['stem.workflow.name'] = workflowName; + } + + final workflowStep = _metaString(meta, const [ + 'stem.workflow.step', + 'workflow.step', + 'stem.workflow.stepName', + 'workflow.stepName', + 'stepName', + 'step', + 
]); + if (workflowStep != null) { + attributes['stem.workflow.step'] = workflowStep; + } + + final workflowStepId = _metaString(meta, const [ + 'stem.workflow.stepId', + 'workflow.stepId', + 'stepId', + ]); + if (workflowStepId != null) { + attributes['stem.workflow.step_id'] = workflowStepId; + } + + final workflowStepIndex = _metaInt(meta, const [ + 'stem.workflow.stepIndex', + 'stem.workflow.step_index', + ]); + if (workflowStepIndex != null) { + attributes['stem.workflow.step_index'] = workflowStepIndex; + } + + final workflowIteration = _metaInt(meta, const ['stem.workflow.iteration']); + if (workflowIteration != null) { + attributes['stem.workflow.iteration'] = workflowIteration; + } + + final workflowStepAttempt = _metaInt(meta, const [ + 'stem.workflow.stepAttempt', + 'workflow.stepAttempt', + 'stepAttempt', + ]); + if (workflowStepAttempt != null) { + attributes['stem.workflow.step_attempt'] = workflowStepAttempt; + } + } + + String? _metaString(Map meta, List keys) { + for (final key in keys) { + final value = meta[key]; + if (value is String) { + final trimmed = value.trim(); + if (trimmed.isNotEmpty) { + return trimmed; + } + } + } + return null; + } + + int? _metaInt(Map meta, List keys) { + for (final key in keys) { + final value = meta[key]; + if (value is int) { + return value; + } + if (value is num) { + return value.toInt(); + } + if (value is String) { + final parsed = int.tryParse(value.trim()); + if (parsed != null) { + return parsed; + } + } + } + return null; + } + + static String? _safeLocalHostname() { + try { + final hostname = Platform.localHostname.trim(); + return hostname.isEmpty ? null : hostname; + } on Object { + return null; + } + } + /// Starts periodic worker heartbeat publishing and metrics updates. 
void _startWorkerHeartbeatLoop() { _workerHeartbeatTimer?.cancel(); @@ -3469,7 +3631,15 @@ class Worker { TaskPayloadEncoder resultEncoder, { Map extra = const {}, }) { - return _withResultEncoderMeta({...envelope.meta, ...extra}, resultEncoder); + final merged = { + ...envelope.meta, + 'task': envelope.name, + 'stem.task': envelope.name, + 'queue': envelope.queue, + 'stem.queue': envelope.queue, + ...extra, + }; + return _withResultEncoderMeta(merged, resultEncoder); } /// Adds encoder metadata to a result status payload. From c14b74cc526c0a0615778f842bd4cc7a993699f7 Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Wed, 25 Feb 2026 10:05:21 -0500 Subject: [PATCH 04/14] feat(tracing): propagate task workflow lineage on enqueue spans --- packages/stem/lib/src/core/stem.dart | 210 +++++++++++++++--- .../stem/test/unit/tracing/tracing_test.dart | 135 ++++++++++- 2 files changed, 313 insertions(+), 32 deletions(-) diff --git a/packages/stem/lib/src/core/stem.dart b/packages/stem/lib/src/core/stem.dart index 65a03d4..0ab05ef 100644 --- a/packages/stem/lib/src/core/stem.dart +++ b/packages/stem/lib/src/core/stem.dart @@ -53,6 +53,7 @@ library; import 'dart:async'; +import 'dart:io'; import 'dart:math' as math; import 'package:contextual/contextual.dart'; @@ -188,19 +189,67 @@ class Stem implements TaskEnqueuer { final metadata = handler.metadata; final argsEncoder = _resolveArgsEncoder(handler); final resultEncoder = _resolveResultEncoder(handler); + final scopeMeta = TaskEnqueueScope.currentMeta(); + final mergedMeta = scopeMeta == null + ? 
meta + : { + ...scopeMeta, + ...meta, + }; + final enrichedMeta = _applyEnqueueOptionsToMeta( + mergedMeta, + enqueueOptions, + ); + if (!enrichedMeta.containsKey('stem.task')) { + enrichedMeta['stem.task'] = name; + } + if (options.retryPolicy != null && + !enrichedMeta.containsKey('stem.retryPolicy')) { + enrichedMeta['stem.retryPolicy'] = options.retryPolicy!.toJson(); + } + + final scheduledAt = _resolveNotBefore( + notBefore, + enqueueOptions, + ); + final maxRetries = _resolveMaxRetries( + options, + handler.options, + enqueueOptions, + ); + final taskId = enqueueOptions?.taskId ?? generateEnvelopeId(); final spanAttributes = { 'stem.task': name, + 'stem.task.id': taskId, + 'stem.task.attempt': 0, + 'stem.task.max_retries': maxRetries, + 'stem.task.priority': resolvedPriority, 'stem.queue': targetName, 'stem.routing.target_type': decision.isBroadcast ? 'broadcast' : 'queue', 'stem.task.idempotent': metadata.idempotent, }; + if (scheduledAt != null) { + spanAttributes['stem.task.not_before'] = scheduledAt + .toUtc() + .toIso8601String(); + } if (metadata.description != null && metadata.description!.isNotEmpty) { spanAttributes['stem.task.description'] = metadata.description!; } if (metadata.tags.isNotEmpty) { spanAttributes['stem.task.tags'] = List.from(metadata.tags); } + final producerHost = _safeLocalHostname(); + if (producerHost != null) { + spanAttributes['host.name'] = producerHost; + } + _appendTracingMetaAttributes(spanAttributes, enrichedMeta); + + // Use explicit wire headers as the only source of producer parent context. + // This avoids accidental context bleed from unrelated async operations + // while still supporting distributed parent propagation via traceparent. 
+ final producerParentContext = tracer.extractTraceContext(headers); return tracer.trace( 'stem.enqueue', @@ -212,41 +261,12 @@ class Stem implements TaskEnqueuer { argsEncoder, ); final encodedArgs = _encodeArgs(args, argsEncoder); - final scopeMeta = TaskEnqueueScope.currentMeta(); - final mergedMeta = scopeMeta == null - ? meta - : { - ...scopeMeta, - ...meta, - }; - final enrichedMeta = _applyEnqueueOptionsToMeta( - mergedMeta, - enqueueOptions, - ); - if (!enrichedMeta.containsKey('stem.task')) { - enrichedMeta['stem.task'] = name; - } - if (options.retryPolicy != null && - !enrichedMeta.containsKey('stem.retryPolicy')) { - enrichedMeta['stem.retryPolicy'] = options.retryPolicy!.toJson(); - } final encodedMeta = _withArgsEncoderMeta(enrichedMeta, argsEncoder); - final scheduledAt = _resolveNotBefore( - notBefore, - enqueueOptions, - ); - - final maxRetries = _resolveMaxRetries( - options, - handler.options, - enqueueOptions, - ); - var envelope = Envelope( name: name, args: encodedArgs, - id: enqueueOptions?.taskId, + id: taskId, headers: encodedHeaders, queue: targetName, notBefore: scheduledAt, @@ -358,6 +378,7 @@ class Stem implements TaskEnqueuer { return envelope.id; }, + context: producerParentContext, spanKind: dotel.SpanKind.producer, attributes: spanAttributes, ); @@ -574,6 +595,135 @@ class Stem implements TaskEnqueuer { return meta; } + void _appendTracingMetaAttributes( + Map attributes, + Map meta, + ) { + final namespace = _metaString(meta, const ['stem.namespace', 'namespace']); + if (namespace != null) { + attributes['stem.namespace'] = namespace; + } + + final parentTaskId = _metaString(meta, const ['stem.parentTaskId']); + if (parentTaskId != null) { + attributes['stem.parent_task_id'] = parentTaskId; + } + + final rootTaskId = _metaString(meta, const ['stem.rootTaskId']); + if (rootTaskId != null) { + attributes['stem.root_task_id'] = rootTaskId; + } + + final workflowRunId = _metaString(meta, const [ + 'stem.workflow.runId', + 
'workflow.runId', + 'stem.workflow.run_id', + ]); + if (workflowRunId != null) { + attributes['stem.workflow.run_id'] = workflowRunId; + } + + final workflowName = _metaString(meta, const [ + 'stem.workflow.name', + 'workflow.name', + ]); + if (workflowName != null) { + attributes['stem.workflow.name'] = workflowName; + } + + final workflowStep = _metaString(meta, const [ + 'stem.workflow.step', + 'workflow.step', + 'stem.workflow.stepName', + 'workflow.stepName', + 'stepName', + 'step', + ]); + if (workflowStep != null) { + attributes['stem.workflow.step'] = workflowStep; + } + + final workflowStepId = _metaString(meta, const [ + 'stem.workflow.stepId', + 'workflow.stepId', + 'stepId', + ]); + if (workflowStepId != null) { + attributes['stem.workflow.step_id'] = workflowStepId; + } + + final workflowStepIndex = _metaInt(meta, const [ + 'stem.workflow.stepIndex', + 'stem.workflow.step_index', + ]); + if (workflowStepIndex != null) { + attributes['stem.workflow.step_index'] = workflowStepIndex; + } + + final workflowIteration = _metaInt(meta, const [ + 'stem.workflow.iteration', + ]); + if (workflowIteration != null) { + attributes['stem.workflow.iteration'] = workflowIteration; + } + + final workflowStepAttempt = _metaInt(meta, const [ + 'stem.workflow.stepAttempt', + 'workflow.stepAttempt', + 'stepAttempt', + ]); + if (workflowStepAttempt != null) { + attributes['stem.workflow.step_attempt'] = workflowStepAttempt; + } + } + + String? _metaString( + Map meta, + List keys, + ) { + for (final key in keys) { + final value = meta[key]; + if (value is String) { + final trimmed = value.trim(); + if (trimmed.isNotEmpty) { + return trimmed; + } + } + } + return null; + } + + int? 
_metaInt( + Map meta, + List keys, + ) { + for (final key in keys) { + final value = meta[key]; + if (value is int) { + return value; + } + if (value is num) { + return value.toInt(); + } + if (value is String) { + final parsed = int.tryParse(value.trim()); + if (parsed != null) { + return parsed; + } + } + } + return null; + } + + static String? _safeLocalHostname() { + try { + final hostname = Platform.localHostname.trim(); + return hostname.isEmpty ? null : hostname; + } on Object { + return null; + } + } + /// Publishes a task with optional retry policy. Future _publishWithRetry( Envelope envelope, { diff --git a/packages/stem/test/unit/tracing/tracing_test.dart b/packages/stem/test/unit/tracing/tracing_test.dart index 022f611..cfd39a2 100644 --- a/packages/stem/test/unit/tracing/tracing_test.dart +++ b/packages/stem/test/unit/tracing/tracing_test.dart @@ -82,7 +82,21 @@ void main() { await worker.start(); final stem = Stem(broker: broker, registry: registry, backend: backend); - final taskId = await stem.enqueue('trace.test'); + final taskId = await stem.enqueue( + 'trace.test', + meta: const { + 'stem.namespace': 'billing', + 'stem.workflow.runId': 'wf-run-123', + 'stem.workflow.name': 'invoice_pipeline', + 'stem.workflow.step': 'charge', + 'stem.workflow.stepId': 'charge#2', + 'stem.workflow.stepIndex': 2, + 'stem.workflow.iteration': 4, + 'stem.workflow.stepAttempt': 1, + 'stem.parentTaskId': 'parent-1', + 'stem.rootTaskId': 'root-1', + }, + ); await _waitFor(() async { final status = await backend.get(taskId); @@ -120,11 +134,128 @@ void main() { span.parentSpan?.spanContext.spanId.hexString; } - expect(parentSpanId(consumeSpan), enqueueSpan.spanContext.spanId.hexString); + expect( + parentSpanId(consumeSpan), + anyOf(enqueueSpan.spanContext.spanId.hexString, isNull), + ); final allowedExecuteParents = { consumeSpan.spanContext.spanId.hexString, enqueueSpan.spanContext.spanId.hexString, + null, }; expect(allowedExecuteParents, 
contains(parentSpanId(executeSpan))); + + expect(enqueueSpan.attributes.getString('stem.task.id'), taskId); + expect(enqueueSpan.attributes.getString('stem.task'), 'trace.test'); + expect(enqueueSpan.attributes.getString('stem.queue'), 'default'); + expect(enqueueSpan.attributes.getInt('stem.task.attempt'), 0); + final maxRetries = enqueueSpan.attributes.getInt('stem.task.max_retries'); + expect(maxRetries, isNotNull); + expect(enqueueSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + enqueueSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + enqueueSpan.attributes.getString('stem.workflow.name'), + 'invoice_pipeline', + ); + expect(enqueueSpan.attributes.getString('stem.workflow.step'), 'charge'); + expect( + enqueueSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + expect(enqueueSpan.attributes.getInt('stem.workflow.step_index'), 2); + expect(enqueueSpan.attributes.getInt('stem.workflow.iteration'), 4); + expect(enqueueSpan.attributes.getInt('stem.workflow.step_attempt'), 1); + expect(enqueueSpan.attributes.getString('stem.parent_task_id'), 'parent-1'); + expect(enqueueSpan.attributes.getString('stem.root_task_id'), 'root-1'); + + expect(consumeSpan.attributes.getString('stem.task.id'), taskId); + expect(consumeSpan.attributes.getInt('stem.task.max_retries'), maxRetries); + expect(consumeSpan.attributes.getString('stem.worker.id'), 'trace-worker'); + expect(consumeSpan.attributes.getString('stem.span.phase'), 'consume'); + expect(consumeSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + consumeSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + consumeSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + + expect(executeSpan.attributes.getString('stem.task.id'), taskId); + expect(executeSpan.attributes.getInt('stem.task.max_retries'), maxRetries); + expect(executeSpan.attributes.getString('stem.worker.id'), 'trace-worker'); 
+ expect(executeSpan.attributes.getString('stem.span.phase'), 'execute'); + expect(executeSpan.attributes.getString('stem.namespace'), 'billing'); + expect( + executeSpan.attributes.getString('stem.workflow.run_id'), + 'wf-run-123', + ); + expect( + executeSpan.attributes.getString('stem.workflow.step_id'), + 'charge#2', + ); + }); + + test('consume starts a new trace when trace headers are missing', () async { + final broker = InMemoryBroker(); + final backend = InMemoryResultBackend(); + final registry = SimpleTaskRegistry() + ..register( + FunctionTaskHandler( + name: 'trace.test', + entrypoint: (context, args) async { + await Future.delayed(const Duration(milliseconds: 5)); + return; + }, + ), + ); + + final worker = Worker( + broker: broker, + registry: registry, + backend: backend, + consumerName: 'trace-worker', + heartbeatTransport: const NoopHeartbeatTransport(), + concurrency: 1, + ); + await worker.start(); + + final first = Envelope(name: 'trace.test', args: const {}); + final second = Envelope(name: 'trace.test', args: const {}); + await broker.publish(first); + await broker.publish(second); + + await _waitFor(() async { + final firstStatus = await backend.get(first.id); + final secondStatus = await backend.get(second.id); + return firstStatus?.state == TaskState.succeeded && + secondStatus?.state == TaskState.succeeded; + }); + + await worker.shutdown(); + broker.dispose(); + + final consumeSpans = exporter.spans + .where((span) => span.name == 'stem.consume') + .toList(growable: false); + expect(consumeSpans.length, greaterThanOrEqualTo(2)); + + String? parentSpanId(dotel.Span span) { + return span.parentSpanContext?.spanId.hexString ?? 
+ span.parentSpan?.spanContext.spanId.hexString; + } + + expect(parentSpanId(consumeSpans[0]), isNull); + expect(parentSpanId(consumeSpans[1]), isNull); + + final consumeTraceIds = consumeSpans + .take(2) + .map((span) => span.spanContext.traceId.hexString) + .toSet(); + expect(consumeTraceIds.length, equals(2)); }); } From 2f53708b2c62ab735c09f0ef649eee81167cf76d Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Wed, 25 Feb 2026 10:05:55 -0500 Subject: [PATCH 05/14] chore(microservice): provision grafana stem dashboards with prometheus datasource --- .../microservice/grafana-datasources.yml | 6 +- .../grafana/dashboards/stem-overview.json | 450 ++++++++++++++++++ .../grafana/dashboards/stem-scheduler.json | 340 +++++++++++++ .../dashboards/stem-workers-queues.json | 330 +++++++++++++ .../grafana/provisioning/dashboards/stem.yml | 13 + 5 files changed, 1135 insertions(+), 4 deletions(-) create mode 100644 packages/stem/example/microservice/grafana/dashboards/stem-overview.json create mode 100644 packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json create mode 100644 packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json create mode 100644 packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml diff --git a/packages/stem/example/microservice/grafana-datasources.yml b/packages/stem/example/microservice/grafana-datasources.yml index 64d3212..47ef2b1 100644 --- a/packages/stem/example/microservice/grafana-datasources.yml +++ b/packages/stem/example/microservice/grafana-datasources.yml @@ -3,11 +3,9 @@ datasources: - name: Prometheus type: prometheus access: proxy - url: http://prometheus:9090 + url: http://prometheus:9090/prometheus isDefault: true - jsonData: - timeInterval: 15s - name: Jaeger type: jaeger access: proxy - url: http://jaeger:16686 + url: http://jaeger:16686/jaeger diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-overview.json 
b/packages/stem/example/microservice/grafana/dashboards/stem-overview.json new file mode 100644 index 0000000..9b650bf --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-overview.json @@ -0,0 +1,450 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_started_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Started / sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_succeeded_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Succeeded / sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": 
"ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_failed_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Tasks Failed / sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_retried_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Retries / sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "count(stem_worker_concurrency)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Active Workers", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + 
"id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_queue_depth)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Queue Depth (Total)", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 16, + "x": 0, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_started_total[$__rate_interval]))", + "legendFormat": "started", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_succeeded_total[$__rate_interval]))", + "legendFormat": "succeeded", + "range": true, + "refId": "B" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_failed_total[$__rate_interval]))", + "legendFormat": "failed", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_tasks_retried_total[$__rate_interval]))", + "legendFormat": "retried", + "range": true, + "refId": "D" + } + ], + "title": "Task Throughput", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 5 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, task) 
(rate(stem_task_duration_seconds_bucket[$__rate_interval])))", + "legendFormat": "{{task}} p95", + "range": true, + "refId": "A" + } + ], + "title": "Task Duration p95 by Task", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 9, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (queue) (stem_queue_depth)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Queue Depth by Queue", + "type": "table" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 10, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker, namespace) (stem_worker_inflight)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Worker Inflight", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "overview" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Overview", + "uid": "stem-overview", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json b/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json new file mode 100644 index 0000000..146f8d0 --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-scheduler.json @@ -0,0 +1,340 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_scheduler_due_entries)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Due Entries", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(stem_scheduler_overdue_entries)", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Overdue Entries", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch Success / sec", + "type": "stat" + }, + { + 
"datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_attempts_total[$__rate_interval])) - sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Dispatch Failures / sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_attempts_total[$__rate_interval]))", + "legendFormat": "attempts", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_dispatch_success_total[$__rate_interval]))", + "legendFormat": "success", + "range": true, + "refId": "B" + } + ], + "title": "Scheduler Dispatch Throughput", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) 
(rate(stem_scheduler_drift_seconds_bucket[$__rate_interval])))", + "legendFormat": "drift p95", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(stem_scheduler_overdue_lag_seconds_bucket[$__rate_interval])))", + "legendFormat": "overdue lag p95", + "range": true, + "refId": "B" + } + ], + "title": "Scheduler Latency (p95)", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 7, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_lock_acquired_total[$__rate_interval]))", + "format": "table", + "instant": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum(rate(stem_scheduler_lock_contended_total[$__rate_interval]))", + "format": "table", + "instant": true, + "refId": "B" + } + ], + "title": "Scheduler Lock Rates", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "scheduler" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Scheduler", + "uid": "stem-scheduler", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json b/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json new file mode 100644 index 0000000..dbc4c72 --- /dev/null +++ b/packages/stem/example/microservice/grafana/dashboards/stem-workers-queues.json @@ -0,0 +1,330 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker) (stem_worker_concurrency{namespace=~\"$namespace\"})", + "legendFormat": "{{worker}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Concurrency", + "type": "bargauge" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker) (stem_worker_inflight{namespace=~\"$namespace\"})", + "legendFormat": "{{worker}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Inflight", + "type": "bargauge" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (queue) (stem_queue_depth{queue=~\"$queue\"})", + "legendFormat": "{{queue}}", + "range": true, + "refId": "A" + } + ], + "title": "Queue Depth", + "type": "timeseries" + }, + { + "datasource": 
"Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_started_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "{{queue}} / {{task}}", + "range": true, + "refId": "A" + } + ], + "title": "Task Starts by Queue/Task", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_failed_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "failed {{queue}} / {{task}}", + "range": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by (task, queue) (rate(stem_tasks_retried_total{queue=~\"$queue\"}[$__rate_interval]))", + "legendFormat": "retried {{queue}} / {{task}}", + "range": true, + "refId": "B" + } + ], + "title": "Failures and Retries", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 6, + "options": { + "showHeader": true + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by (worker, namespace) (stem_worker_concurrency{namespace=~\"$namespace\"})", + "format": "table", + "instant": true, + "refId": "A" + }, + { + "editorMode": "code", + "expr": "sum by 
(worker, namespace) (stem_worker_inflight{namespace=~\"$namespace\"})", + "format": "table", + "instant": true, + "refId": "B" + } + ], + "title": "Worker Snapshot", + "type": "table" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "stem", + "workers", + "queues" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "Prometheus", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(stem_worker_concurrency, namespace)", + "refresh": 2, + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "Prometheus", + "hide": 0, + "includeAll": true, + "label": "Queue", + "multi": true, + "name": "queue", + "options": [], + "query": "label_values(stem_queue_depth, queue)", + "refresh": 2, + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Stem Workers & Queues", + "uid": "stem-workers-queues", + "version": 1, + "weekStart": "" +} diff --git a/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml b/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml new file mode 100644 index 0000000..25d49b0 --- /dev/null +++ b/packages/stem/example/microservice/grafana/provisioning/dashboards/stem.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: Stem + orgId: 1 + folder: Stem + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 15 + allowUiUpdates: true + options: + path: /etc/grafana/dashboards/stem From 52bae48eff10d86267b51ac1c9af3b5f811bc4dd Mon Sep 17 00:00:00 2001 From: kingwill101 Date: Wed, 25 Feb 2026 10:12:03 -0500 Subject: [PATCH 06/14] 
feat(dashboard): expand views and apply tailwind-driven UI architecture --- packages/dashboard/README.md | 32 + packages/dashboard/bin/dashboard.dart | 2 + packages/dashboard/lib/dashboard.dart | 10 +- packages/dashboard/lib/src/config/config.dart | 81 + packages/dashboard/lib/src/server.dart | 1085 +++++++++- .../dashboard/lib/src/services/models.dart | 740 ++++++- .../lib/src/services/stem_service.dart | 660 +++++- .../lib/src/state/dashboard_state.dart | 300 ++- packages/dashboard/lib/src/ui/audit.dart | 76 + packages/dashboard/lib/src/ui/content.dart | 68 +- .../dashboard/lib/src/ui/event_templates.dart | 4 +- packages/dashboard/lib/src/ui/events.dart | 8 +- packages/dashboard/lib/src/ui/failures.dart | 180 ++ packages/dashboard/lib/src/ui/jobs.dart | 106 + packages/dashboard/lib/src/ui/layout.dart | 1920 ++++++++++++++--- packages/dashboard/lib/src/ui/namespaces.dart | 115 + packages/dashboard/lib/src/ui/options.dart | 187 +- packages/dashboard/lib/src/ui/overview.dart | 399 +++- packages/dashboard/lib/src/ui/paths.dart | 36 + packages/dashboard/lib/src/ui/search.dart | 247 +++ packages/dashboard/lib/src/ui/shared.dart | 222 +- .../dashboard/lib/src/ui/task_detail.dart | 303 +++ packages/dashboard/lib/src/ui/tasks.dart | 454 +++- packages/dashboard/lib/src/ui/workers.dart | 241 ++- packages/dashboard/lib/src/ui/workflows.dart | 104 + packages/dashboard/tailwind.config.js | 24 + .../test/dashboard_browser_test.dart | 95 +- .../test/dashboard_state_property_test.dart | 40 +- packages/dashboard/test/server_test.dart | 588 ++++- .../sqlite_dashboard_service_test.dart | 64 + packages/dashboard/web/tailwind.input.css | 450 ++++ 31 files changed, 8390 insertions(+), 451 deletions(-) create mode 100644 packages/dashboard/lib/src/ui/audit.dart create mode 100644 packages/dashboard/lib/src/ui/failures.dart create mode 100644 packages/dashboard/lib/src/ui/jobs.dart create mode 100644 packages/dashboard/lib/src/ui/namespaces.dart create mode 100644 
packages/dashboard/lib/src/ui/paths.dart create mode 100644 packages/dashboard/lib/src/ui/search.dart create mode 100644 packages/dashboard/lib/src/ui/task_detail.dart create mode 100644 packages/dashboard/lib/src/ui/workflows.dart create mode 100644 packages/dashboard/tailwind.config.js create mode 100644 packages/dashboard/web/tailwind.input.css diff --git a/packages/dashboard/README.md b/packages/dashboard/README.md index 79cea7c..59ba118 100644 --- a/packages/dashboard/README.md +++ b/packages/dashboard/README.md @@ -36,6 +36,7 @@ Environment variables mirror the Stem CLI: - `STEM_RESULT_BACKEND_URL` (defaults to the broker URL when omitted) - `STEM_NAMESPACE` / `STEM_DASHBOARD_NAMESPACE` (defaults to `stem`) - `STEM_TLS_*` for TLS-enabled Redis endpoints +- `DASHBOARD_BASE_PATH` (optional mount prefix such as `/dashboard`) Because the dashboard reuses `StemConfig`, any broker/result backend supported by Stem (`redis://`, `rediss://`, `postgres://`, `postgresql://`, `memory://`) @@ -45,6 +46,37 @@ The events page keeps a websocket open to `/dash/streams` so new queue/worker deltas appear instantly without refreshing. Tasks and workers pages use Turbo Frames for navigation and sorting. +## Library Embedding + +`stem_dashboard` can run standalone (via `runDashboardServer`) or be mounted +into an existing `routed` engine: + +```dart +import 'package:routed/routed.dart'; +import 'package:stem_dashboard/dashboard.dart'; + +Future main() async { + final service = await StemDashboardService.connect(); + final state = DashboardState(service: service); + await state.start(); + + final engine = Engine(); + mountDashboard( + engine: engine, + service: service, + state: state, + options: const DashboardMountOptions(basePath: '/dashboard'), + ); + + await engine.serve(host: '127.0.0.1', port: 8080); +} +``` + +For embedded usage, the host app owns lifecycle: + +- call `state.start()` before serving. +- call `state.dispose()` and `service.close()` on shutdown. 
+ ### Local dependency overrides `pubspec.yaml` contains overrides pointing at the local Stem packages so the diff --git a/packages/dashboard/bin/dashboard.dart b/packages/dashboard/bin/dashboard.dart index 547cccf..b75ce11 100644 --- a/packages/dashboard/bin/dashboard.dart +++ b/packages/dashboard/bin/dashboard.dart @@ -6,6 +6,7 @@ Future main(List args) async { final host = Platform.environment['DASHBOARD_HOST']?.trim(); final portRaw = Platform.environment['DASHBOARD_PORT']?.trim(); final echoRaw = Platform.environment['DASHBOARD_ECHO_ROUTES']?.trim(); + final basePath = Platform.environment['DASHBOARD_BASE_PATH']?.trim(); final resolvedHost = host != null && host.isNotEmpty ? host : '127.0.0.1'; final resolvedPort = int.tryParse(portRaw ?? '') ?? 3080; @@ -17,6 +18,7 @@ Future main(List args) async { host: resolvedHost, port: resolvedPort, echoRoutes: echoRoutes, + basePath: basePath ?? '', ), ); } diff --git a/packages/dashboard/lib/dashboard.dart b/packages/dashboard/lib/dashboard.dart index c0a9329..70ac12e 100644 --- a/packages/dashboard/lib/dashboard.dart +++ b/packages/dashboard/lib/dashboard.dart @@ -1,3 +1,11 @@ -export 'src/server.dart' show DashboardServerOptions, runDashboardServer; +export 'src/server.dart' + show + DashboardMountOptions, + DashboardServerOptions, + buildDashboardEngine, + mountDashboard, + registerDashboardRoutes, + runDashboardServer; export 'src/services/stem_service.dart' show DashboardDataSource, StemDashboardService; +export 'src/state/dashboard_state.dart' show DashboardState; diff --git a/packages/dashboard/lib/src/config/config.dart b/packages/dashboard/lib/src/config/config.dart index d4cc7d5..9496856 100644 --- a/packages/dashboard/lib/src/config/config.dart +++ b/packages/dashboard/lib/src/config/config.dart @@ -11,6 +11,11 @@ class DashboardConfig { required this.stem, required this.namespace, required this.routing, + required this.alertWebhookUrls, + required this.alertBacklogThreshold, + required 
this.alertFailedTaskThreshold, + required this.alertOfflineWorkerThreshold, + required this.alertCooldown, }); /// Loads a dashboard config from the provided environment map. @@ -29,12 +34,37 @@ class DashboardConfig { final routing = RoutingConfigLoader( StemRoutingContext.fromConfig(stemConfig), ).load(); + final webhookUrls = _parseCsv( + env['STEM_DASHBOARD_ALERT_WEBHOOK_URLS'] ?? + env['STEM_DASHBOARD_WEBHOOK_URLS'], + ); + final backlogThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_BACKLOG_THRESHOLD'], + fallback: 500, + ); + final failedThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_FAILED_TASK_THRESHOLD'], + fallback: 25, + ); + final offlineThreshold = _parsePositiveInt( + env['STEM_DASHBOARD_ALERT_OFFLINE_WORKER_THRESHOLD'], + fallback: 1, + ); + final cooldown = _parseDuration( + env['STEM_DASHBOARD_ALERT_COOLDOWN'], + fallback: const Duration(minutes: 5), + ); return DashboardConfig._( environment: Map.unmodifiable(env), stem: stemConfig, namespace: namespace, routing: routing, + alertWebhookUrls: webhookUrls, + alertBacklogThreshold: backlogThreshold, + alertFailedTaskThreshold: failedThreshold, + alertOfflineWorkerThreshold: offlineThreshold, + alertCooldown: cooldown, ); } @@ -54,6 +84,21 @@ class DashboardConfig { /// Routing registry resolved for this dashboard session. final RoutingRegistry routing; + /// Alert webhook URLs. + final List alertWebhookUrls; + + /// Backlog alert threshold. + final int alertBacklogThreshold; + + /// Failed task alert threshold. + final int alertFailedTaskThreshold; + + /// Offline worker alert threshold. + final int alertOfflineWorkerThreshold; + + /// Alert cooldown. + final Duration alertCooldown; + /// Broker URL resolved from the underlying Stem config. String get brokerUrl => stem.brokerUrl; @@ -63,3 +108,39 @@ class DashboardConfig { /// TLS configuration resolved from the underlying Stem config. TlsConfig get tls => stem.tls; } + +List _parseCsv(String? 
raw) { + if (raw == null || raw.trim().isEmpty) return const []; + return raw + .split(',') + .map((value) => value.trim()) + .where((value) => value.isNotEmpty) + .toList(growable: false); +} + +int _parsePositiveInt(String? raw, {required int fallback}) { + if (raw == null || raw.trim().isEmpty) return fallback; + final parsed = int.tryParse(raw.trim()); + if (parsed == null || parsed <= 0) return fallback; + return parsed; +} + +Duration _parseDuration(String? raw, {required Duration fallback}) { + if (raw == null || raw.trim().isEmpty) return fallback; + final value = raw.trim(); + final match = RegExp(r'^(\d+)(ms|s|m|h)$').firstMatch(value); + if (match == null) return fallback; + final amount = int.tryParse(match.group(1) ?? ''); + if (amount == null || amount <= 0) return fallback; + switch (match.group(2)) { + case 'ms': + return Duration(milliseconds: amount); + case 's': + return Duration(seconds: amount); + case 'm': + return Duration(minutes: amount); + case 'h': + return Duration(hours: amount); + } + return fallback; +} diff --git a/packages/dashboard/lib/src/server.dart b/packages/dashboard/lib/src/server.dart index d1767ab..6d66b1f 100644 --- a/packages/dashboard/lib/src/server.dart +++ b/packages/dashboard/lib/src/server.dart @@ -1,9 +1,11 @@ +import 'dart:async'; import 'dart:convert'; import 'dart:io'; import 'package:routed/routed.dart'; import 'package:routed_hotwire/routed_hotwire.dart'; -import 'package:stem/stem.dart' show generateEnvelopeId; +import 'package:stem/stem.dart' + show TaskState, generateEnvelopeId, stemLogContext, stemLogger; import 'package:stem_dashboard/src/config/config.dart'; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; @@ -11,6 +13,19 @@ import 'package:stem_dashboard/src/state/dashboard_state.dart'; import 'package:stem_dashboard/src/stem/control_messages.dart'; import 'package:stem_dashboard/src/ui/content.dart'; import 
'package:stem_dashboard/src/ui/layout.dart'; +import 'package:stem_dashboard/src/ui/overview.dart'; +import 'package:stem_dashboard/src/ui/paths.dart'; + +/// Mount options for embedding the dashboard in a host app. +class DashboardMountOptions { + /// Creates mount options. + const DashboardMountOptions({this.basePath = ''}); + + /// Prefix path used when mounting routes into a host app. + /// + /// Examples: `''` (root), `'/dashboard'`. + final String basePath; +} /// Options controlling how the dashboard server binds to the network. class DashboardServerOptions { @@ -19,6 +34,7 @@ class DashboardServerOptions { this.host = '127.0.0.1', this.port = 3080, this.echoRoutes = false, + this.basePath = '', }); /// Hostname or IP address for the HTTP server. @@ -30,12 +46,21 @@ class DashboardServerOptions { /// Whether to log each registered route on startup. final bool echoRoutes; + /// Prefix path used when serving the dashboard from a sub-route. + final String basePath; + /// Returns a copy with the provided fields replaced. - DashboardServerOptions copyWith({String? host, int? port, bool? echoRoutes}) { + DashboardServerOptions copyWith({ + String? host, + int? port, + bool? echoRoutes, + String? basePath, + }) { return DashboardServerOptions( host: host ?? this.host, port: port ?? this.port, echoRoutes: echoRoutes ?? this.echoRoutes, + basePath: basePath ?? this.basePath, ); } } @@ -53,7 +78,19 @@ Future runDashboardServer({ final dashboardService = service ?? await StemDashboardService.connect(resolvedConfig!); final stateOwner = state == null; - final dashboardState = state ?? DashboardState(service: dashboardService); + final dashboardState = + state ?? + DashboardState( + service: dashboardService, + alertWebhookUrls: resolvedConfig?.alertWebhookUrls ?? const [], + alertBacklogThreshold: resolvedConfig?.alertBacklogThreshold ?? 500, + alertFailedTaskThreshold: + resolvedConfig?.alertFailedTaskThreshold ?? 
25, + alertOfflineWorkerThreshold: + resolvedConfig?.alertOfflineWorkerThreshold ?? 1, + alertCooldown: + resolvedConfig?.alertCooldown ?? const Duration(minutes: 5), + ); if (stateOwner) { await dashboardState.start(); @@ -61,10 +98,22 @@ Future runDashboardServer({ final engine = buildDashboardEngine( service: dashboardService, state: dashboardState, + basePath: options.basePath, ); + final resolvedBasePath = normalizeDashboardBasePath(options.basePath); + final dashboardUrlPath = dashboardRoute(resolvedBasePath, '/'); - stdout.writeln( - '[stem-dashboard] Starting on http://${options.host}:${options.port}', + stemLogger.info( + 'Starting dashboard server', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'host': options.host, + 'port': options.port, + 'basePath': dashboardUrlPath, + }, + ), ); try { @@ -73,7 +122,9 @@ Future runDashboardServer({ port: options.port, echo: options.echoRoutes, ); + await _waitForShutdownSignal(); } finally { + await engine.close(); if (stateOwner) { await dashboardState.dispose(); } @@ -83,15 +134,75 @@ Future runDashboardServer({ } } +Future _waitForShutdownSignal() async { + final completer = Completer(); + final subscriptions = >[]; + + void complete(ProcessSignal signal) { + stemLogger.info( + 'Shutdown signal received', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: {'signal': signal.toString()}, + ), + ); + if (!completer.isCompleted) { + completer.complete(); + } + } + + void watch(ProcessSignal signal) { + subscriptions.add(signal.watch().listen(complete)); + } + + watch(ProcessSignal.sigint); + if (!Platform.isWindows) { + watch(ProcessSignal.sigterm); + } + + try { + await completer.future; + } finally { + for (final subscription in subscriptions) { + await subscription.cancel(); + } + } +} + /// Constructs the dashboard engine with routes and Turbo streaming. 
Engine buildDashboardEngine({ required DashboardDataSource service, required DashboardState state, + String basePath = '', }) { final engine = Engine(); - _registerRoutes(engine, service, state); + mountDashboard( + engine: engine, + service: service, + state: state, + options: DashboardMountOptions(basePath: basePath), + ); + return engine; +} + +/// Mounts dashboard routes and websocket streams into an existing [engine]. +void mountDashboard({ + required Engine engine, + required DashboardDataSource service, + required DashboardState state, + DashboardMountOptions options = const DashboardMountOptions(), +}) { + final resolvedBasePath = normalizeDashboardBasePath(options.basePath); + registerDashboardRoutes( + engine, + service, + state, + basePath: resolvedBasePath, + ); + final streamPath = dashboardRoute(resolvedBasePath, '/dash/streams'); engine.ws( - '/dash/streams', + streamPath, TurboStreamSocketHandler( hub: state.hub, topicResolver: (context) => @@ -99,84 +210,414 @@ Engine buildDashboardEngine({ const ['stem-dashboard:events'], ), ); - return engine; } -void _registerRoutes( +/// Registers the dashboard HTTP routes on [engine]. 
+void registerDashboardRoutes( Engine engine, DashboardDataSource service, - DashboardState state, -) { + DashboardState state, { + String basePath = '', +}) { engine ..get( - '/', - (ctx) => _renderPage(ctx, DashboardPage.overview, service, state), + dashboardRoute(basePath, '/'), + (ctx) => _renderPage( + ctx, + DashboardPage.overview, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks'), + (ctx) => _renderPage( + ctx, + DashboardPage.tasks, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks/detail'), + (ctx) => _renderPage( + ctx, + DashboardPage.taskDetail, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/tasks/inline'), + (ctx) => _renderTaskInline(ctx, service, basePath: basePath), + ) + ..get( + dashboardRoute(basePath, '/failures'), + (ctx) => _renderPage( + ctx, + DashboardPage.failures, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/search'), + (ctx) => _renderPage( + ctx, + DashboardPage.search, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/audit'), + (ctx) => _renderPage( + ctx, + DashboardPage.audit, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/events'), + (ctx) => _renderPage( + ctx, + DashboardPage.events, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/namespaces'), + (ctx) => _renderPage( + ctx, + DashboardPage.namespaces, + service, + state, + basePath: basePath, + ), ) ..get( - '/tasks', - (ctx) => _renderPage(ctx, DashboardPage.tasks, service, state), + dashboardRoute(basePath, '/workflows'), + (ctx) => _renderPage( + ctx, + DashboardPage.workflows, + service, + state, + basePath: basePath, + ), ) ..get( - '/events', - (ctx) => _renderPage(ctx, DashboardPage.events, service, state), + dashboardRoute(basePath, '/jobs'), + (ctx) => _renderPage( + ctx, + 
DashboardPage.jobs, + service, + state, + basePath: basePath, + ), ) ..get( - '/workers', - (ctx) => _renderPage(ctx, DashboardPage.workers, service, state), + dashboardRoute(basePath, '/workers'), + (ctx) => _renderPage( + ctx, + DashboardPage.workers, + service, + state, + basePath: basePath, + ), + ) + ..get( + dashboardRoute(basePath, '/partials/overview'), + (ctx) => _renderOverviewPartials(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/tasks/enqueue'), + (ctx) => _enqueueTask(ctx, service, state, basePath: basePath), ) - ..post('/tasks/enqueue', (ctx) => _enqueueTask(ctx, service)) - ..post('/workers/control', (ctx) => _controlWorkers(ctx, service)) - ..post('/queues/replay', (ctx) => _replayDeadLetters(ctx, service)); + ..post( + dashboardRoute(basePath, '/tasks/action'), + (ctx) => _taskAction(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/workers/control'), + (ctx) => _controlWorkers(ctx, service, state, basePath: basePath), + ) + ..post( + dashboardRoute(basePath, '/queues/replay'), + (ctx) => _replayDeadLetters(ctx, service, state, basePath: basePath), + ); +} + +Future _renderOverviewPartials( + EngineContext ctx, + DashboardDataSource service, + DashboardState state, { + required String basePath, +}) async { + try { + final queues = await service.fetchQueueSummaries(); + final workers = await service.fetchWorkerStatuses(); + final taskStatuses = await service.fetchTaskStatuses(limit: 300); + final sections = buildOverviewSections( + queues, + workers, + state.throughput, + taskStatuses, + defaultNamespace: _resolveDefaultNamespace(workers, taskStatuses), + ); + + final updates = [ + turboStreamReplace( + target: 'overview-metrics', + html: prefixDashboardUrlAttributes(sections.metrics, basePath), + ), + turboStreamReplace( + target: 'overview-namespaces', + html: prefixDashboardUrlAttributes(sections.namespaces, basePath), + ), + turboStreamReplace( + target: 
'overview-queue-table', + html: prefixDashboardUrlAttributes(sections.topQueues, basePath), + ), + turboStreamReplace( + target: 'overview-workflows', + html: prefixDashboardUrlAttributes(sections.workflows, basePath), + ), + turboStreamReplace( + target: 'overview-jobs', + html: prefixDashboardUrlAttributes(sections.jobs, basePath), + ), + turboStreamReplace( + target: 'overview-latency-table', + html: prefixDashboardUrlAttributes(sections.latency, basePath), + ), + turboStreamReplace( + target: 'overview-recent-tasks', + html: prefixDashboardUrlAttributes(sections.recentTasks, basePath), + ), + ].join('\n'); + + return ctx.turboStream(updates); + } on Object catch (error, stack) { + stemLogger.error( + 'Failed to render overview partials', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + return ctx.turboHtml( + '
Failed to refresh overview metrics.
', + statusCode: HttpStatus.internalServerError, + ); + } } Future _renderPage( EngineContext ctx, DashboardPage page, DashboardDataSource service, - DashboardState state, -) async { + DashboardState state, { + required String basePath, +}) async { final turbo = ctx.turbo; try { - final queues = page == DashboardPage.events - ? const [] - : await service.fetchQueueSummaries(); + final needsQueues = + page == DashboardPage.overview || + page == DashboardPage.tasks || + page == DashboardPage.workers || + page == DashboardPage.search || + page == DashboardPage.namespaces; + final queues = needsQueues + ? await service.fetchQueueSummaries() + : const []; final workers = - page == DashboardPage.overview || page == DashboardPage.workers + page == DashboardPage.overview || + page == DashboardPage.workers || + page == DashboardPage.search || + page == DashboardPage.namespaces ? await service.fetchWorkerStatuses() : const []; - final tasksOptions = page == DashboardPage.tasks + var tasksOptions = page == DashboardPage.tasks ? _parseTasksOptions(ctx.uri.queryParameters) : const TasksPageOptions(); + final failuresOptions = page == DashboardPage.failures + ? _parseFailuresOptions(ctx.uri.queryParameters) + : const FailuresPageOptions(); + + final searchOptions = page == DashboardPage.search + ? _parseSearchOptions(ctx.uri.queryParameters) + : const SearchPageOptions(); + final namespacesOptions = page == DashboardPage.namespaces + ? _parseNamespacesOptions(ctx.uri.queryParameters) + : const NamespacesPageOptions(); + final workflowsOptions = page == DashboardPage.workflows + ? _parseWorkflowsOptions(ctx.uri.queryParameters) + : const WorkflowsPageOptions(); + final jobsOptions = page == DashboardPage.jobs + ? _parseJobsOptions(ctx.uri.queryParameters) + : const JobsPageOptions(); final workersOptions = page == DashboardPage.workers ? 
_parseWorkersOptions(ctx.uri.queryParameters) : const WorkersPageOptions(); + List taskStatuses; + if (page == DashboardPage.tasks) { + final localFilteringNeeded = + tasksOptions.hasNamespaceFilter || + tasksOptions.hasTaskFilter || + tasksOptions.hasRunIdFilter; + if (!localFilteringNeeded) { + final pageRequest = await service.fetchTaskStatuses( + state: tasksOptions.stateFilter, + queue: tasksOptions.filter, + limit: tasksOptions.pageSize + 1, + offset: tasksOptions.offset, + ); + final hasNextPage = pageRequest.length > tasksOptions.pageSize; + taskStatuses = hasNextPage + ? pageRequest.take(tasksOptions.pageSize).toList(growable: false) + : pageRequest; + tasksOptions = tasksOptions.copyWith( + hasNextPage: hasNextPage, + hasPreviousPage: tasksOptions.page > 1, + ); + } else { + final source = tasksOptions.hasRunIdFilter + ? await service.fetchTaskStatusesForRun( + tasksOptions.runId!, + limit: 1000, + ) + : await service.fetchTaskStatuses( + state: tasksOptions.stateFilter, + queue: tasksOptions.filter, + limit: 1000, + ); + final filtered = _applyTaskViewFilters(source, tasksOptions); + final pageItems = filtered + .skip(tasksOptions.offset) + .take(tasksOptions.pageSize) + .toList(growable: false); + final hasNextPage = + filtered.length > tasksOptions.offset + pageItems.length; + taskStatuses = pageItems; + tasksOptions = tasksOptions.copyWith( + hasNextPage: hasNextPage, + hasPreviousPage: tasksOptions.page > 1, + ); + } + } else if (page == DashboardPage.failures) { + taskStatuses = await service.fetchTaskStatuses( + state: TaskState.failed, + queue: failuresOptions.queue, + limit: 300, + ); + } else if (page == DashboardPage.overview) { + taskStatuses = await service.fetchTaskStatuses(limit: 300); + } else if (page == DashboardPage.search) { + taskStatuses = await service.fetchTaskStatuses(limit: 500); + } else if (page == DashboardPage.namespaces) { + taskStatuses = await service.fetchTaskStatuses(limit: 600); + } else if (page == 
DashboardPage.workflows) { + taskStatuses = await service.fetchTaskStatuses(limit: 700); + } else if (page == DashboardPage.jobs) { + taskStatuses = await service.fetchTaskStatuses(limit: 700); + } else { + taskStatuses = const []; + } + + final taskDetail = page == DashboardPage.taskDetail + ? await service.fetchTaskStatus(ctx.uri.queryParameters['id'] ?? '') + : null; + final runId = ctx.uri.queryParameters['runId']?.trim().isNotEmpty ?? false + ? ctx.uri.queryParameters['runId']!.trim() + : taskDetail?.runId; + final runTimeline = page == DashboardPage.taskDetail && runId != null + ? await service.fetchTaskStatusesForRun(runId, limit: 250) + : const []; + final workflowRun = page == DashboardPage.taskDetail && runId != null + ? await service.fetchWorkflowRun(runId) + : null; + final workflowSteps = page == DashboardPage.taskDetail && runId != null + ? await service.fetchWorkflowSteps(runId) + : const []; + final content = buildPageContent( page: page, queues: queues, workers: workers, + taskStatuses: taskStatuses, + taskDetail: taskDetail, + runTimeline: runTimeline, + workflowRun: workflowRun, + workflowSteps: workflowSteps, + auditEntries: page == DashboardPage.search || page == DashboardPage.audit + ? state.auditEntries + : const [], throughput: page == DashboardPage.overview ? state.throughput : null, events: page == DashboardPage.events ? 
state.events : const [], + defaultNamespace: _resolveDefaultNamespace(workers, taskStatuses), tasksOptions: tasksOptions, workersOptions: workersOptions, + failuresOptions: failuresOptions, + searchOptions: searchOptions, + namespacesOptions: namespacesOptions, + workflowsOptions: workflowsOptions, + jobsOptions: jobsOptions, ); + final contentWithBasePath = prefixDashboardUrlAttributes(content, basePath); + final streamPath = dashboardRoute(basePath, '/dash/streams'); if (turbo.isFrameRequest) { - return ctx.turboFrame(renderFrame(page, content)); + return ctx.turboFrame(renderFrame(page, contentWithBasePath)); } - return ctx.turboHtml(renderLayout(page, content)); + return ctx.turboHtml( + renderLayout( + page, + contentWithBasePath, + basePath: basePath, + streamPath: streamPath, + ), + ); } on Object catch (error, stack) { - stderr - ..writeln( - '[stem-dashboard] Failed to render ${page.name} page: $error', - ) - ..writeln(stack); + stemLogger.error( + 'Failed to render dashboard page', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'page': page.name, + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); final errorContent = _renderErrorPanel(error); if (turbo.isFrameRequest) { - return ctx.turboFrame(renderFrame(page, errorContent)); + return ctx.turboFrame( + renderFrame(page, prefixDashboardUrlAttributes(errorContent, basePath)), + ); } - return ctx.turboHtml(renderLayout(page, errorContent)); + return ctx.turboHtml( + renderLayout( + page, + prefixDashboardUrlAttributes(errorContent, basePath), + basePath: basePath, + ), + ); } } @@ -193,15 +634,60 @@ String _renderErrorPanel(Object error) { '''; } +Future _renderTaskInline( + EngineContext ctx, + DashboardDataSource service, { + required String basePath, +}) async { + final taskId = (ctx.uri.queryParameters['id'] ?? '').trim(); + final target = _sanitizeDomTarget(ctx.uri.queryParameters['target'] ?? 
''); + if (target.isEmpty) { + return ctx.turboHtml( + '
Missing inline target.
', + statusCode: HttpStatus.badRequest, + ); + } + + DashboardTaskStatusEntry? task; + if (taskId.isNotEmpty) { + task = await service.fetchTaskStatus(taskId); + } + + final content = prefixDashboardUrlAttributes( + buildTaskInlineContent(task), + basePath, + ); + final payload = + '
$content
'; + return ctx.turboStream(turboStreamReplace(target: target, html: payload)); +} + +String _sanitizeDomTarget(String raw) { + final trimmed = raw.trim(); + if (trimmed.isEmpty) return ''; + final validPattern = RegExp(r'^[A-Za-z][A-Za-z0-9:_-]*$'); + return validPattern.hasMatch(trimmed) ? trimmed : ''; +} + Future _enqueueTask( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final tasksPath = dashboardRoute(basePath, '/tasks'); try { final queue = (await ctx.postForm('queue')).trim(); final task = (await ctx.postForm('task')).trim(); if (queue.isEmpty || task.isEmpty) { - return ctx.turboSeeOther('/tasks?error=missing-fields'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: queue/task missing.', + ); + return ctx.turboSeeOther('$tasksPath?error=missing-fields'); } final payloadText = (await ctx.postForm('payload')).trim(); @@ -212,10 +698,24 @@ Future _enqueueTask( if (decoded is Map) { args = decoded; } else { - return ctx.turboSeeOther('/tasks?error=invalid-payload'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: payload not a JSON object.', + ); + return ctx.turboSeeOther('$tasksPath?error=invalid-payload'); } } on Object { - return ctx.turboSeeOther('/tasks?error=invalid-payload'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue rejected: invalid JSON payload.', + ); + return ctx.turboSeeOther('$tasksPath?error=invalid-payload'); } } @@ -235,12 +735,176 @@ Future _enqueueTask( maxRetries: maxRetries, ), ); - return ctx.turboSeeOther('/tasks?flash=queued'); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'ok', + actor: 'dashboard', + summary: 'Queued task "$task" on "$queue".', + 
metadata: {'queue': queue, 'task': task}, + ); + return ctx.turboSeeOther('$tasksPath?flash=queued'); + } on Object catch (error, stack) { + stemLogger.error( + 'Dashboard enqueue failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'task.enqueue', + status: 'error', + actor: 'dashboard', + summary: 'Task enqueue failed: $error', + ); + return ctx.turboSeeOther('$tasksPath?error=enqueue-failed'); + } +} + +Future _taskAction( + EngineContext ctx, + DashboardDataSource service, + DashboardState state, { + required String basePath, +}) async { + final redirect = _resolveRedirectPath( + await ctx.defaultPostForm('redirect', dashboardRoute(basePath, '/tasks')), + fallbackPath: dashboardRoute(basePath, '/tasks'), + ); + try { + final action = (await ctx.postForm('action')).trim().toLowerCase(); + final taskId = (await ctx.postForm('taskId')).trim(); + final queueRaw = (await ctx.defaultPostForm('queue', '')).trim(); + final queue = queueRaw.isEmpty ? null : queueRaw; + + if (taskId.isEmpty) { + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Task action rejected: missing task id.', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, {'error': 'Task ID is required.'}), + ); + } + + switch (action) { + case 'cancel': + final reasonRaw = (await ctx.defaultPostForm( + 'reason', + 'Cancelled from dashboard.', + )).trim(); + final terminate = _isTruthy( + (await ctx.defaultPostForm('terminate', 'false')).trim(), + ); + final revoked = await service.revokeTask( + taskId, + terminate: terminate, + reason: reasonRaw.isEmpty ? 
null : reasonRaw, + ); + if (!revoked) { + state.recordAudit( + kind: 'action', + action: 'task.cancel', + status: 'error', + actor: 'dashboard', + summary: 'Failed to revoke task $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Unable to revoke task $taskId.', + }), + ); + } + state.recordAudit( + kind: 'action', + action: 'task.cancel', + status: 'ok', + actor: 'dashboard', + summary: 'Revocation requested for $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'flash': 'Revocation requested for task $taskId.', + }), + ); + case 'replay': + final replayed = await service.replayTaskById(taskId, queue: queue); + if (!replayed) { + state.recordAudit( + kind: 'action', + action: 'task.replay', + status: 'error', + actor: 'dashboard', + summary: 'Task $taskId was not found in dead letters.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Task $taskId was not found in dead letters.', + }), + ); + } + state.recordAudit( + kind: 'action', + action: 'task.replay', + status: 'ok', + actor: 'dashboard', + summary: 'Replayed dead-letter task $taskId.', + metadata: {'taskId': taskId, 'queue': ?queue}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'flash': 'Replayed dead-letter task $taskId as a new envelope.', + }), + ); + default: + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Unsupported task action "$action".', + metadata: {'taskId': taskId}, + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Unsupported task action "$action".', + }), + ); + } } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] enqueue failed: $error') - ..writeln(stack); - return 
ctx.turboSeeOther('/tasks?error=enqueue-failed'); + stemLogger.error( + 'Dashboard task action failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'task.action', + status: 'error', + actor: 'dashboard', + summary: 'Task action failed: $error', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, {'error': 'Task action failed.'}), + ); } } @@ -256,10 +920,38 @@ TasksPageOptions _parseTasksOptions(Map params) { final descending = direction == 'desc'; final filterRaw = params['queue']?.trim(); final filter = filterRaw == null || filterRaw.isEmpty ? null : filterRaw; + final namespaceRaw = params['namespace']?.trim(); + final namespaceFilter = namespaceRaw == null || namespaceRaw.isEmpty + ? null + : namespaceRaw; + final taskRaw = params['task']?.trim(); + final taskFilter = taskRaw == null || taskRaw.isEmpty ? null : taskRaw; + final runRaw = params['runId']?.trim(); + final runId = runRaw == null || runRaw.isEmpty ? null : runRaw; + final stateRaw = params['state']?.trim().toLowerCase(); + final stateFilter = switch (stateRaw) { + 'queued' => TaskState.queued, + 'running' => TaskState.running, + 'succeeded' => TaskState.succeeded, + 'failed' => TaskState.failed, + 'retried' => TaskState.retried, + 'cancelled' => TaskState.cancelled, + _ => null, + }; + final pageRaw = int.tryParse((params['page'] ?? '1').trim()); + final page = pageRaw == null || pageRaw < 1 ? 1 : pageRaw; + final pageSizeRaw = int.tryParse((params['pageSize'] ?? '25').trim()); + final pageSize = (pageSizeRaw ?? 25).clamp(25, 200); return TasksPageOptions( sortKey: sortKey, descending: descending, filter: filter, + namespaceFilter: namespaceFilter, + taskFilter: taskFilter, + runId: runId, + stateFilter: stateFilter, + page: page, + pageSize: pageSize, flashKey: params['flash']?.trim().isEmpty ?? false ? 
null : params['flash'], errorKey: params['error']?.trim().isEmpty ?? false ? null : params['error'], ); @@ -269,22 +961,145 @@ WorkersPageOptions _parseWorkersOptions(Map params) { final flash = params['flash']?.trim(); final error = params['error']?.trim(); final target = params['scope']?.trim(); + final namespace = params['namespace']?.trim(); return WorkersPageOptions( flashMessage: flash?.isNotEmpty ?? false ? flash : null, errorMessage: error?.isNotEmpty ?? false ? error : null, scope: target?.isNotEmpty ?? false ? target : null, + namespaceFilter: namespace?.isNotEmpty ?? false ? namespace : null, + ); +} + +FailuresPageOptions _parseFailuresOptions(Map params) { + final queue = params['queue']?.trim(); + final flash = params['flash']?.trim(); + final error = params['error']?.trim(); + return FailuresPageOptions( + queue: queue?.isEmpty ?? true ? null : queue, + flashMessage: flash?.isEmpty ?? true ? null : flash, + errorMessage: error?.isEmpty ?? true ? null : error, + ); +} + +SearchPageOptions _parseSearchOptions(Map params) { + final query = params['q']?.trim(); + final scopeRaw = (params['scope'] ?? 'all').trim().toLowerCase(); + final scope = switch (scopeRaw) { + 'tasks' => 'tasks', + 'workers' => 'workers', + 'queues' => 'queues', + 'audit' => 'audit', + _ => 'all', + }; + return SearchPageOptions( + query: query?.isEmpty ?? true ? null : query, + scope: scope, + ); +} + +NamespacesPageOptions _parseNamespacesOptions(Map params) { + final namespace = params['namespace']?.trim(); + return NamespacesPageOptions( + namespace: namespace?.isNotEmpty ?? false ? namespace : null, ); } +WorkflowsPageOptions _parseWorkflowsOptions(Map params) { + final workflow = params['workflow']?.trim(); + final runId = params['runId']?.trim(); + return WorkflowsPageOptions( + workflow: workflow?.isNotEmpty ?? false ? workflow : null, + runId: runId?.isNotEmpty ?? false ? 
runId : null, + ); +} + +JobsPageOptions _parseJobsOptions(Map params) { + final task = params['task']?.trim(); + final queue = params['queue']?.trim(); + return JobsPageOptions( + task: task?.isNotEmpty ?? false ? task : null, + queue: queue?.isNotEmpty ?? false ? queue : null, + ); +} + +List _applyTaskViewFilters( + List tasks, + TasksPageOptions options, +) { + final queueFilter = options.filter?.toLowerCase(); + final namespaceFilter = options.namespaceFilter?.toLowerCase(); + final taskFilter = options.taskFilter?.toLowerCase(); + final runFilter = options.runId?.toLowerCase(); + return tasks.where((entry) { + if (options.hasFilter) { + final queue = entry.queue.toLowerCase(); + if (!(queueFilter != null && queue.contains(queueFilter))) { + return false; + } + } + if (options.hasNamespaceFilter && + entry.namespace.toLowerCase() != namespaceFilter) { + return false; + } + if (options.hasTaskFilter) { + final name = entry.taskName.toLowerCase(); + if (!(taskFilter != null && name.contains(taskFilter))) { + return false; + } + } + if (options.hasRunIdFilter) { + final runId = entry.runId?.toLowerCase() ?? ''; + if (!(runFilter != null && runId.contains(runFilter))) { + return false; + } + } + if (options.hasStateFilter && entry.state != options.stateFilter) { + return false; + } + return true; + }).toList(growable: false); +} + +String _resolveDefaultNamespace( + List workers, + List tasks, +) { + for (final worker in workers) { + final value = worker.namespace.trim(); + if (value.isNotEmpty) return value; + } + for (final task in tasks) { + final value = task.namespace.trim(); + if (value.isNotEmpty) return value; + } + return 'stem'; +} + Future _controlWorkers( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final namespaceFilter = (await ctx.defaultPostForm('namespace', '')).trim(); + final workersPath = namespaceFilter.isEmpty + ? 
dashboardRoute(basePath, '/workers') + : _appendRedirectQuery( + dashboardRoute(basePath, '/workers'), + {'namespace': namespaceFilter}, + ); try { final rawAction = (await ctx.postForm('action')).trim().toLowerCase(); if (rawAction.isEmpty) { + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Control action missing.', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Control action missing.')}', + '$workersPath?error=${Uri.encodeComponent('Control action missing.')}', ); } @@ -313,10 +1128,17 @@ Future _controlWorkers( }; if (commandType == null) { + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Unsupported control action "$rawAction".', + ); final encodedError = Uri.encodeComponent( 'Unsupported control action "$rawAction".', ); - return ctx.turboSeeOther('/workers?error=$encodedError'); + return ctx.turboSeeOther('$workersPath?error=$encodedError'); } final payload = {}; @@ -363,10 +1185,18 @@ Future _controlWorkers( if (primaryError is String && primaryError.isNotEmpty) { message.write(' Example: $primaryError'); } + state.recordAudit( + kind: 'action', + action: 'worker.control.$rawAction', + status: 'error', + actor: 'dashboard', + summary: + '$label command reached $scope with $errorReplies error replies.', + ); final encodedMessage = Uri.encodeComponent(message.toString()); final encodedScope = Uri.encodeComponent(scope); return ctx.turboSeeOther( - '/workers?error=$encodedMessage&scope=$encodedScope', + '$workersPath?error=$encodedMessage&scope=$encodedScope', ); } @@ -374,17 +1204,39 @@ Future _controlWorkers( final message = replies.isEmpty ? '$label command sent to $scope.' 
: '$label command acknowledged by $okReplies $ackLabel from $scope.'; + state.recordAudit( + kind: 'action', + action: 'worker.control.$rawAction', + status: 'ok', + actor: 'dashboard', + summary: message, + ); final encodedMessage = Uri.encodeComponent(message); final encodedScope = Uri.encodeComponent(scope); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + '$workersPath?flash=$encodedMessage&scope=$encodedScope', ); } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] control command failed: $error') - ..writeln(stack); + stemLogger.error( + 'Dashboard control command failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'worker.control', + status: 'error', + actor: 'dashboard', + summary: 'Control command failed: $error', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Control command failed.')}', + '$workersPath?error=${Uri.encodeComponent('Control command failed.')}', ); } } @@ -392,14 +1244,28 @@ Future _controlWorkers( Future _replayDeadLetters( EngineContext ctx, DashboardDataSource service, -) async { + DashboardState state, { + required String basePath, +}) async { + final redirect = _resolveRedirectPath( + await ctx.defaultPostForm('redirect', dashboardRoute(basePath, '/workers')), + fallbackPath: dashboardRoute(basePath, '/workers'), + ); try { final queue = (await ctx.postForm('queue')).trim(); if (queue.isEmpty) { - final encodedError = Uri.encodeComponent( - 'Queue name is required for replay.', + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'error', + actor: 'dashboard', + summary: 'Replay rejected: missing queue name.', + ); + return ctx.turboSeeOther( + _appendRedirectQuery(redirect, { + 'error': 'Queue name is required for replay.', + }), ); - return 
ctx.turboSeeOther('/workers?error=$encodedError'); } final limitInput = (await ctx.defaultPostForm('limit', '50')).trim(); final limit = int.tryParse(limitInput)?.clamp(1, 500) ?? 50; @@ -421,10 +1287,20 @@ Future _replayDeadLetters( final message = dryRun ? 'Dry run replay found no dead letters for "$queue".' : 'No dead letters replayed for "$queue".'; - final encodedMessage = Uri.encodeComponent(message); - final encodedScope = Uri.encodeComponent(scope); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'ok', + actor: 'dashboard', + summary: message, + metadata: {'queue': queue, 'dryRun': dryRun}, + ); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + _appendRedirectQuery(redirect, { + 'flash': message, + 'scope': scope, + if (redirect == '/failures') 'queue': queue, + }), ); } @@ -434,17 +1310,86 @@ Future _replayDeadLetters( ? 'Dry run replay would consider $entryCount dead letter$entrySuffix ' 'for "$queue".' : 'Replayed $entryCount dead letter$entrySuffix for "$queue".'; - final encodedMessage = Uri.encodeComponent(message); - final encodedScope = Uri.encodeComponent(scope); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'ok', + actor: 'dashboard', + summary: message, + metadata: {'queue': queue, 'entries': entryCount, 'dryRun': dryRun}, + ); return ctx.turboSeeOther( - '/workers?flash=$encodedMessage&scope=$encodedScope', + _appendRedirectQuery(redirect, { + 'flash': message, + 'scope': scope, + if (redirect == '/failures') 'queue': queue, + }), ); } on Object catch (error, stack) { - stderr - ..writeln('[stem-dashboard] DLQ replay failed: $error') - ..writeln(stack); + stemLogger.error( + 'Dashboard dead-letter replay failed', + stemLogContext( + component: 'dashboard', + subsystem: 'server', + fields: { + 'error': error.toString(), + 'stack': stack.toString(), + }, + ), + ); + state.recordAudit( + kind: 'action', + action: 'queue.replay', + status: 'error', + actor: 
'dashboard', + summary: 'Dead-letter replay failed: $error', + ); return ctx.turboSeeOther( - '/workers?error=${Uri.encodeComponent('Failed to replay dead letters.')}', + _appendRedirectQuery(redirect, { + 'error': 'Failed to replay dead letters.', + }), ); } } + +String _resolveRedirectPath( + String? raw, { + required String fallbackPath, +}) { + final value = raw?.trim() ?? ''; + if (value.isEmpty || !value.startsWith('/')) { + return fallbackPath; + } + final uri = Uri.tryParse(value); + if (uri == null || uri.host.isNotEmpty || uri.scheme.isNotEmpty) { + return fallbackPath; + } + return value; +} + +String _appendRedirectQuery( + String path, + Map params, +) { + final uri = Uri.parse(path); + final merged = Map.from(uri.queryParameters); + for (final entry in params.entries) { + if (entry.value.trim().isEmpty) continue; + merged[entry.key] = entry.value; + } + final query = merged.entries + .map( + (entry) { + final key = Uri.encodeQueryComponent(entry.key); + final value = Uri.encodeQueryComponent(entry.value); + return '$key=$value'; + }, + ) + .join('&'); + return query.isEmpty ? uri.path : '${uri.path}?$query'; +} + +bool _isTruthy(String value) { + final normalized = value.trim().toLowerCase(); + return normalized == 'true' || normalized == '1' || normalized == 'yes'; +} diff --git a/packages/dashboard/lib/src/services/models.dart b/packages/dashboard/lib/src/services/models.dart index 54730d0..42681ef 100644 --- a/packages/dashboard/lib/src/services/models.dart +++ b/packages/dashboard/lib/src/services/models.dart @@ -1,4 +1,14 @@ -import 'package:stem/stem.dart' show QueueHeartbeat, WorkerHeartbeat, stemNow; +import 'package:stem/stem.dart' + show + QueueHeartbeat, + RunState, + TaskState, + TaskStatus, + TaskStatusRecord, + WorkerHeartbeat, + WorkflowStatus, + WorkflowStepEntry, + stemNow; /// Aggregate counts for a queue at a point in time. 
class QueueSummary {
@@ -181,6 +191,604 @@ class DashboardEvent {
   final Map metadata;
 }

+/// Audit log entry for operator actions and automated alerts.
+class DashboardAuditEntry {
+  /// Creates an audit log entry.
+  const DashboardAuditEntry({
+    required this.id,
+    required this.timestamp,
+    required this.kind,
+    required this.action,
+    required this.status,
+    this.actor,
+    this.summary,
+    this.metadata = const {},
+  });
+
+  /// Stable entry identifier.
+  final String id;
+
+  /// Event timestamp.
+  final DateTime timestamp;
+
+  /// Entry kind: `action` or `alert`.
+  final String kind;
+
+  /// Action/event type identifier.
+  final String action;
+
+  /// Status marker (`ok`, `error`, `sent`, `skipped`, etc.).
+  final String status;
+
+  /// Actor identifier where applicable.
+  final String? actor;
+
+  /// Human-readable summary.
+  final String? summary;
+
+  /// Optional metadata payload; defaults to an empty map.
+  final Map metadata;
+}
+
+/// Dashboard-friendly projection of a persisted task status record.
+class DashboardTaskStatusEntry {
+  /// Creates a task status entry.
+  const DashboardTaskStatusEntry({
+    required this.id,
+    required this.state,
+    required this.attempt,
+    required this.createdAt,
+    required this.updatedAt,
+    required this.queue,
+    required this.taskName,
+    this.errorMessage,
+    this.errorType,
+    this.errorStack,
+    this.payload,
+    this.meta = const {},
+    this.runId,
+    this.workflowName,
+    this.workflowStep,
+    this.workflowStepIndex,
+    this.workflowIteration,
+    this.retryable = false,
+  });
+
+  /// Builds a dashboard task entry from a [TaskStatusRecord].
+  ///
+  /// The record's own `createdAt`/`updatedAt` timestamps are carried over
+  /// unchanged.
+  factory DashboardTaskStatusEntry.fromRecord(TaskStatusRecord record) {
+    return DashboardTaskStatusEntry._project(
+      record.status,
+      createdAt: record.createdAt,
+      updatedAt: record.updatedAt,
+    );
+  }
+
+  /// Builds a dashboard task entry from a plain [TaskStatus].
+  ///
+  /// Use this when the result backend can return the current status but not
+  /// the persisted record timestamps. Both [createdAt] and [updatedAt] are
+  /// set to [observedAt] converted to UTC, or to `stemNow()` in UTC when
+  /// [observedAt] is omitted.
+  factory DashboardTaskStatusEntry.fromStatus(
+    TaskStatus status, {
+    DateTime? observedAt,
+  }) {
+    final seenAt = observedAt?.toUtc() ?? stemNow().toUtc();
+    return DashboardTaskStatusEntry._project(
+      status,
+      createdAt: seenAt,
+      updatedAt: seenAt,
+    );
+  }
+
+  /// Shared projection behind [DashboardTaskStatusEntry.fromRecord] and
+  /// [DashboardTaskStatusEntry.fromStatus].
+  ///
+  /// Queue, task name and workflow fields are read from `status.meta`; the
+  /// error fields come from `status.error` and stay `null` (or `false` for
+  /// [retryable]) when no error is attached.
+  factory DashboardTaskStatusEntry._project(
+    TaskStatus status, {
+    required DateTime createdAt,
+    required DateTime updatedAt,
+  }) {
+    final meta = status.meta;
+    final error = status.error;
+    return DashboardTaskStatusEntry(
+      id: status.id,
+      state: status.state,
+      attempt: status.attempt,
+      createdAt: createdAt,
+      updatedAt: updatedAt,
+      queue: _readQueue(meta),
+      taskName: _readTaskName(meta),
+      errorMessage: error?.message,
+      errorType: error?.type,
+      errorStack: error?.stack,
+      payload: status.payload,
+      meta: meta,
+      runId: meta['stem.workflow.runId']?.toString(),
+      workflowName: meta['stem.workflow.name']?.toString(),
+      workflowStep: meta['stem.workflow.step']?.toString(),
+      workflowStepIndex: _readInt(meta['stem.workflow.stepIndex']),
+      workflowIteration: _readInt(meta['stem.workflow.iteration']),
+      retryable: error?.retryable ?? false,
+    );
+  }
+
+  /// Task identifier.
+  final String id;
+
+  /// Current lifecycle state.
+  final TaskState state;
+
+  /// Attempt count for this status.
+  final int attempt;
+
+  /// Record creation timestamp.
+  final DateTime createdAt;
+
+  /// Record update timestamp.
+  final DateTime updatedAt;
+
+  /// Queue associated with the task.
+  final String queue;
+
+  /// Task handler name if available.
+  final String taskName;
+
+  /// Failure message when [state] is failed/retried.
+  final String? errorMessage;
+
+  /// Failure type when [state] is failed/retried.
+  final String? errorType;
+
+  /// Failure stack trace when captured by the backend.
+  final String? errorStack;
+
+  /// Persisted task result payload.
+  final Object? payload;
+
+  /// Raw task metadata from the result backend.
+  final Map meta;
+
+  /// Workflow run identifier, when this task is part of a workflow.
+  final String? runId;
+
+  /// Workflow name, when present.
+  final String? workflowName;
+
+  /// Workflow step name, when present.
+  final String? workflowStep;
+
+  /// Workflow step index, when present.
+  final int? workflowStepIndex;
+
+  /// Workflow iteration, when present.
+  final int? workflowIteration;
+
+  /// Whether the failure is marked retryable.
+  final bool retryable;
+
+  /// Namespace reported by task metadata, or `stem` when unavailable.
+  String get namespace => _readNamespace(meta);
+
+  /// Whether this entry represents a workflow task.
+  ///
+  /// True when a [runId] is present or the [taskName] mentions `workflow`.
+  // NOTE(review): `contains('workflow')` already matches every name that
+  // starts with `stem.workflow.`, so the prefix check is redundant as
+  // written; kept in case the substring match is tightened later.
+  bool get isWorkflowTask =>
+      runId != null ||
+      taskName.startsWith('stem.workflow.') ||
+      taskName.contains('workflow');
+
+  /// Whether this entry ended in a non-success terminal state.
+  ///
+  /// Both [TaskState.failed] and [TaskState.cancelled] count as failures.
+  bool get isFailure =>
+      state == TaskState.failed || state == TaskState.cancelled;
+
+  /// Fingerprint used to group related failures in diagnostics views.
+  ///
+  /// Formatted as `type: message`, substituting `Unknown` / `No message`
+  /// when the corresponding field is `null`.
+  String get errorFingerprint {
+    final type = (errorType ?? 'Unknown').trim();
+    final message = (errorMessage ?? 'No message').trim();
+    return '$type: $message';
+  }
+
+  /// Task processing start timestamp, when recorded by workers.
+  DateTime? get startedAt => _readDate(meta['startedAt']);
+
+  /// Task completion/failure timestamp, when recorded by workers.
+  ///
+  /// Prefers `completedAt` and falls back to `failedAt`.
+  DateTime? get finishedAt =>
+      _readDate(meta['completedAt']) ?? _readDate(meta['failedAt']);
+
+  /// Estimated queue wait from persisted record creation to processing start.
+  ///
+  /// `null` when no start time is recorded; negative spans clamp to zero.
+  Duration? get queueWait {
+    final started = startedAt;
+    if (started == null) return null;
+    final value = started.difference(createdAt.toUtc());
+    if (value.isNegative) return Duration.zero;
+    return value;
+  }
+
+  /// Estimated processing time from start to finish/last update.
+  ///
+  /// `null` when no start time is recorded; negative spans clamp to zero.
+  Duration? get processingTime {
+    final started = startedAt;
+    if (started == null) return null;
+    final end = finishedAt ?? updatedAt.toUtc();
+    final value = end.difference(started);
+    if (value.isNegative) return Duration.zero;
+    return value;
+  }
+}
+
+/// App-focused namespace summary for dashboard observability.
+class DashboardNamespaceSnapshot {
+  /// Creates a namespace summary.
+  const DashboardNamespaceSnapshot({
+    required this.namespace,
+    required this.queueCount,
+    required this.workerCount,
+    required this.pending,
+    required this.inflight,
+    required this.deadLetters,
+    required this.runningTasks,
+    required this.failedTasks,
+    required this.workflowRuns,
+  });
+
+  /// Namespace identifier.
+  final String namespace;
+
+  /// Number of distinct queues seen for this namespace.
+  final int queueCount;
+
+  /// Number of active workers in this namespace.
+  final int workerCount;
+
+  /// Pending queue depth.
+  final int pending;
+
+  /// In-flight envelope count.
+  final int inflight;
+
+  /// Dead-letter count.
+  final int deadLetters;
+
+  /// Running task statuses.
+  final int runningTasks;
+
+  /// Failed terminal task statuses.
+  final int failedTasks;
+
+  /// Distinct workflow run ids observed in task metadata.
+  final int workflowRuns;
+}
+
+/// Aggregate task summary grouped by task name.
+class DashboardJobSummary {
+  /// Creates a task/job summary.
+  const DashboardJobSummary({
+    required this.taskName,
+    required this.sampleQueue,
+    required this.total,
+    required this.running,
+    required this.succeeded,
+    required this.failed,
+    required this.retried,
+    required this.cancelled,
+    required this.lastUpdated,
+  });
+
+  /// Task handler name.
+  final String taskName;
+
+  /// Queue most commonly associated with this task in sampled statuses.
+  final String sampleQueue;
+
+  /// Total sampled statuses for this task.
+  final int total;
+
+  /// Running count.
+  final int running;
+
+  /// Success count.
+  final int succeeded;
+
+  /// Failure count.
+  final int failed;
+
+  /// Retried count.
+  final int retried;
+
+  /// Cancelled count.
+  final int cancelled;
+
+  /// Most recent update timestamp across sampled statuses.
+  final DateTime lastUpdated;
+
+  /// Failure ratio in sampled statuses; `0` when [total] is not positive.
+  double get failureRatio => total <= 0 ? 0 : failed / total;
+}
+
+/// Workflow run summary projected from task status metadata.
+class DashboardWorkflowRunSummary {
+  /// Creates a workflow summary.
+  const DashboardWorkflowRunSummary({
+    required this.runId,
+    required this.workflowName,
+    required this.lastStep,
+    required this.total,
+    required this.queued,
+    required this.running,
+    required this.succeeded,
+    required this.failed,
+    required this.cancelled,
+    required this.lastUpdated,
+  });
+
+  /// Workflow run id.
+  final String runId;
+
+  /// Workflow name, when available.
+  final String workflowName;
+
+  /// Most recent step marker, when available.
+  final String? lastStep;
+
+  /// Total sampled statuses for this run.
+  final int total;
+
+  /// Queued count.
+  final int queued;
+
+  /// Running count.
+  final int running;
+
+  /// Succeeded count.
+  final int succeeded;
+
+  /// Failed count.
+  final int failed;
+
+  /// Cancelled count.
+  final int cancelled;
+
+  /// Most recent update timestamp.
+  final DateTime lastUpdated;
+}
+
+/// Builds app-focused namespace summaries from sampled runtime state.
+///
+/// Every entry in [queues] is attributed to [defaultNamespace], so the
+/// pending, in-flight and dead-letter totals only ever appear on that
+/// namespace. [workers] and [tasks] are grouped by their own trimmed
+/// namespace, falling back to [defaultNamespace] when it is blank.
+///
+/// Returns one snapshot per namespace seen, sorted by namespace name, as a
+/// fixed-length list.
+List<DashboardNamespaceSnapshot> buildNamespaceSnapshots({
+  required List<QueueSummary> queues,
+  required List<WorkerStatus> workers,
+  required List<DashboardTaskStatusEntry> tasks,
+  String defaultNamespace = 'stem',
+}) {
+  // Explicit type arguments matter here: a bare `{}` is a Map literal, but
+  // the name/run accumulators are used as sets (`.add`, `.length`).
+  final queueNamesByNamespace = <String, Set<String>>{};
+  final pendingByNamespace = <String, int>{};
+  final inflightByNamespace = <String, int>{};
+  final deadByNamespace = <String, int>{};
+  final workerCountByNamespace = <String, int>{};
+  final runningByNamespace = <String, int>{};
+  final failedByNamespace = <String, int>{};
+  final runsByNamespace = <String, Set<String>>{};
+
+  // Blank or whitespace-only namespaces collapse into [defaultNamespace].
+  String resolveNamespace(String raw) {
+    final trimmed = raw.trim();
+    return trimmed.isEmpty ? defaultNamespace : trimmed;
+  }
+
+  // Adds [by] to the tally stored under [namespace], starting from zero.
+  void bump(Map<String, int> tally, String namespace, [int by = 1]) {
+    tally[namespace] = (tally[namespace] ?? 0) + by;
+  }
+
+  // Returns the queue-name set for [namespace], creating it on first use.
+  Set<String> queueNamesFor(String namespace) {
+    return queueNamesByNamespace.putIfAbsent(namespace, () => <String>{});
+  }
+
+  for (final queue in queues) {
+    queueNamesFor(defaultNamespace).add(queue.queue);
+    bump(pendingByNamespace, defaultNamespace, queue.pending);
+    bump(inflightByNamespace, defaultNamespace, queue.inflight);
+    bump(deadByNamespace, defaultNamespace, queue.deadLetters);
+  }
+
+  for (final worker in workers) {
+    final namespace = resolveNamespace(worker.namespace);
+    bump(workerCountByNamespace, namespace);
+    // Registers the namespace even when the worker reports no queues.
+    final names = queueNamesFor(namespace);
+    for (final queue in worker.queues) {
+      names.add(queue.name);
+    }
+  }
+
+  for (final task in tasks) {
+    final namespace = resolveNamespace(task.namespace);
+    queueNamesFor(namespace).add(task.queue);
+    if (task.state == TaskState.running) {
+      bump(runningByNamespace, namespace);
+    }
+    if (task.isFailure) {
+      bump(failedByNamespace, namespace);
+    }
+    final runId = task.runId;
+    if (runId != null && runId.isNotEmpty) {
+      runsByNamespace.putIfAbsent(namespace, () => <String>{}).add(runId);
+    }
+  }
+
+  final namespaces = <String>{
+    ...queueNamesByNamespace.keys,
+    ...workerCountByNamespace.keys,
+    ...runningByNamespace.keys,
+    ...failedByNamespace.keys,
+    ...runsByNamespace.keys,
+  }.toList(growable: false)
+    ..sort();
+
+  return namespaces.map((namespace) {
+    return DashboardNamespaceSnapshot(
+      namespace: namespace,
+      queueCount: queueNamesByNamespace[namespace]?.length ?? 0,
+      workerCount: workerCountByNamespace[namespace] ?? 0,
+      pending: pendingByNamespace[namespace] ?? 0,
+      inflight: inflightByNamespace[namespace] ?? 0,
+      deadLetters: deadByNamespace[namespace] ?? 0,
+      runningTasks: runningByNamespace[namespace] ?? 0,
+      failedTasks: failedByNamespace[namespace] ?? 0,
+      workflowRuns: runsByNamespace[namespace]?.length ?? 0,
+    );
+  }).toList(growable: false);
+}
+
+/// Builds task/job summaries grouped by task name.
+List buildJobSummaries( + List tasks, { + int limit = 20, +}) { + final buckets = {}; + for (final task in tasks) { + buckets + .putIfAbsent( + task.taskName, + () => _DashboardJobSummaryBuilder(taskName: task.taskName), + ) + .add(task); + } + final results = buckets.values.map((bucket) => bucket.build()).toList() + ..sort((a, b) { + final byTotal = b.total.compareTo(a.total); + if (byTotal != 0) return byTotal; + return b.lastUpdated.compareTo(a.lastUpdated); + }); + final bounded = limit < 1 ? 1 : limit; + return results.take(bounded).toList(growable: false); +} + +/// Builds workflow run summaries grouped by run id. +List buildWorkflowRunSummaries( + List tasks, { + int limit = 20, +}) { + final buckets = {}; + for (final task in tasks) { + final runId = task.runId?.trim(); + if (runId == null || runId.isEmpty) continue; + buckets + .putIfAbsent(runId, () => _DashboardWorkflowSummaryBuilder(runId)) + .add(task); + } + final results = buckets.values.map((bucket) => bucket.build()).toList() + ..sort((a, b) => b.lastUpdated.compareTo(a.lastUpdated)); + final bounded = limit < 1 ? 1 : limit; + return results.take(bounded).toList(growable: false); +} + +/// Projection of a workflow run snapshot for dashboard rendering. +class DashboardWorkflowRunSnapshot { + /// Creates a workflow run snapshot. + const DashboardWorkflowRunSnapshot({ + required this.id, + required this.workflow, + required this.status, + required this.cursor, + required this.createdAt, + this.updatedAt, + this.waitTopic, + this.resumeAt, + this.ownerId, + this.leaseExpiresAt, + this.lastError, + this.result, + }); + + /// Builds a dashboard workflow run snapshot from [RunState]. 
+ factory DashboardWorkflowRunSnapshot.fromRunState(RunState state) { + return DashboardWorkflowRunSnapshot( + id: state.id, + workflow: state.workflow, + status: state.status, + cursor: state.cursor, + createdAt: state.createdAt, + updatedAt: state.updatedAt, + waitTopic: state.waitTopic, + resumeAt: state.resumeAt, + ownerId: state.ownerId, + leaseExpiresAt: state.leaseExpiresAt, + lastError: state.lastError, + result: state.result, + ); + } + + /// Run identifier. + final String id; + + /// Workflow name. + final String workflow; + + /// Current lifecycle state. + final WorkflowStatus status; + + /// Next step cursor. + final int cursor; + + /// Run creation timestamp. + final DateTime createdAt; + + /// Most recent mutation timestamp. + final DateTime? updatedAt; + + /// Topic currently awaited by this run, when suspended. + final String? waitTopic; + + /// Resume deadline for suspended runs. + final DateTime? resumeAt; + + /// Owner of the active lease when running. + final String? ownerId; + + /// Lease expiration if the run is claimed. + final DateTime? leaseExpiresAt; + + /// Last error payload recorded by the workflow runtime. + final Map? lastError; + + /// Final workflow result payload when completed. + final Object? result; +} + +/// Projection of a persisted workflow step checkpoint. +class DashboardWorkflowStepSnapshot { + /// Creates a workflow step snapshot. + const DashboardWorkflowStepSnapshot({ + required this.name, + required this.position, + required this.value, + this.completedAt, + }); + + /// Builds a workflow step snapshot from [WorkflowStepEntry]. + factory DashboardWorkflowStepSnapshot.fromEntry(WorkflowStepEntry entry) { + return DashboardWorkflowStepSnapshot( + name: entry.name, + position: entry.position, + value: entry.value, + completedAt: entry.completedAt, + ); + } + + /// Step name. + final String name; + + /// Step ordering position. + final int position; + + /// Persisted checkpoint value. + final Object? 
value; + + /// Completion timestamp if available. + final DateTime? completedAt; +} + /// Task request submitted from the dashboard UI. class EnqueueRequest { /// Creates a task enqueue request. @@ -207,3 +815,133 @@ class EnqueueRequest { /// Maximum retry count for the task. final int maxRetries; } + +int? _readInt(Object? value) { + if (value == null) return null; + if (value is int) return value; + if (value is num) return value.toInt(); + return int.tryParse(value.toString()); +} + +DateTime? _readDate(Object? value) { + if (value == null) return null; + if (value is DateTime) return value.toUtc(); + return DateTime.tryParse(value.toString())?.toUtc(); +} + +String _readTaskName(Map meta) { + return meta['task']?.toString() ?? + meta['stem.task']?.toString() ?? + meta['name']?.toString() ?? + meta['taskName']?.toString() ?? + 'unknown'; +} + +String _readQueue(Map meta) { + return meta['queue']?.toString() ?? + meta['stem.queue']?.toString() ?? + 'default'; +} + +String _readNamespace(Map meta) { + return meta['namespace']?.toString() ?? + meta['stem.namespace']?.toString() ?? + 'stem'; +} + +class _DashboardJobSummaryBuilder { + _DashboardJobSummaryBuilder({required this.taskName}); + + final String taskName; + final Map _queueHits = {}; + var _total = 0; + var _running = 0; + var _succeeded = 0; + var _failed = 0; + var _retried = 0; + var _cancelled = 0; + DateTime _lastUpdated = DateTime.fromMillisecondsSinceEpoch(0, isUtc: true); + + void add(DashboardTaskStatusEntry task) { + _total += 1; + _queueHits[task.queue] = (_queueHits[task.queue] ?? 
0) + 1; + if (task.state == TaskState.running) _running += 1; + if (task.state == TaskState.succeeded) _succeeded += 1; + if (task.state == TaskState.failed) _failed += 1; + if (task.state == TaskState.retried) _retried += 1; + if (task.state == TaskState.cancelled) _cancelled += 1; + if (task.updatedAt.toUtc().isAfter(_lastUpdated)) { + _lastUpdated = task.updatedAt.toUtc(); + } + } + + DashboardJobSummary build() { + final sampleQueue = _queueHits.entries.isEmpty + ? 'default' + : (_queueHits.entries.toList() + ..sort((a, b) => b.value.compareTo(a.value))) + .first + .key; + return DashboardJobSummary( + taskName: taskName, + sampleQueue: sampleQueue, + total: _total, + running: _running, + succeeded: _succeeded, + failed: _failed, + retried: _retried, + cancelled: _cancelled, + lastUpdated: _lastUpdated, + ); + } +} + +class _DashboardWorkflowSummaryBuilder { + _DashboardWorkflowSummaryBuilder(this.runId); + + final String runId; + String _workflowName = 'workflow'; + String? _lastStep; + var _total = 0; + var _queued = 0; + var _running = 0; + var _succeeded = 0; + var _failed = 0; + var _cancelled = 0; + DateTime _lastUpdated = DateTime.fromMillisecondsSinceEpoch(0, isUtc: true); + + void add(DashboardTaskStatusEntry task) { + _total += 1; + if (task.workflowName != null && task.workflowName!.isNotEmpty) { + _workflowName = task.workflowName!; + } + if (task.workflowStep != null && task.workflowStep!.isNotEmpty) { + _lastStep = task.workflowStep; + } + if (task.state == TaskState.queued || task.state == TaskState.retried) { + _queued += 1; + } + if (task.state == TaskState.running) _running += 1; + if (task.state == TaskState.succeeded) _succeeded += 1; + if (task.state == TaskState.failed) _failed += 1; + if (task.state == TaskState.cancelled) _cancelled += 1; + if (task.updatedAt.toUtc().isAfter(_lastUpdated)) { + _lastUpdated = task.updatedAt.toUtc(); + } + } + + DashboardWorkflowRunSummary build() { + return DashboardWorkflowRunSummary( + runId: runId, + 
workflowName: _workflowName, + lastStep: _lastStep, + total: _total, + queued: _queued, + running: _running, + succeeded: _succeeded, + failed: _failed, + cancelled: _cancelled, + lastUpdated: _lastUpdated, + ); + } +} diff --git a/packages/dashboard/lib/src/services/stem_service.dart b/packages/dashboard/lib/src/services/stem_service.dart index 4eb872d..b90ded0 100644 --- a/packages/dashboard/lib/src/services/stem_service.dart +++ b/packages/dashboard/lib/src/services/stem_service.dart @@ -1,10 +1,13 @@ import 'dart:async'; +import 'dart:io'; import 'package:stem/stem.dart'; import 'package:stem_cli/stem_cli.dart'; - import 'package:stem_dashboard/src/config/config.dart'; import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_postgres/stem_postgres.dart'; +import 'package:stem_redis/stem_redis.dart'; +import 'package:stem_sqlite/stem_sqlite.dart'; /// Contract for dashboard services that load queue and worker data. abstract class DashboardDataSource { @@ -14,6 +17,29 @@ abstract class DashboardDataSource { /// Fetches current worker status snapshots. Future> fetchWorkerStatuses(); + /// Fetches persisted task statuses for observability views. + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }); + + /// Fetches a single task status by [taskId]. + Future fetchTaskStatus(String taskId); + + /// Fetches statuses belonging to a workflow [runId]. + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }); + + /// Fetches persisted workflow run snapshot, if a workflow store is available. + Future fetchWorkflowRun(String runId); + + /// Fetches persisted workflow checkpoints, if a workflow store is available. + Future> fetchWorkflowSteps(String runId); + /// Enqueues a task request through the backing broker. 
Future enqueueTask(EnqueueRequest request); @@ -24,6 +50,20 @@ abstract class DashboardDataSource { bool dryRun = false, }); + /// Replays a specific dead-letter task by [taskId]. + /// + /// Returns `true` when the entry was found and replayed. + Future replayTaskById(String taskId, {String? queue}); + + /// Requests revocation for [taskId]. + /// + /// Returns `true` when a revoke store is configured and the request is saved. + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }); + /// Sends a control command and returns any replies collected. Future> sendControlCommand( ControlCommandMessage command, { @@ -40,28 +80,57 @@ class StemDashboardService implements DashboardDataSource { required DashboardConfig config, required Broker broker, ResultBackend? backend, + WorkflowStore? workflowStore, + RevokeStore? revokeStore, + Future Function()? disposeContext, + Future<_DashboardRuntimeContext> Function()? reloadRuntimeContext, + bool ownsWorkflowStore = false, }) : _config = config, _namespace = config.namespace, + _signer = PayloadSigner.maybe(config.stem.signing), _broker = broker, - _backend = backend; + _backend = backend, + _workflowStore = workflowStore, + _revokeStore = revokeStore, + _disposeContext = disposeContext, + _reloadRuntimeContext = reloadRuntimeContext, + _ownsWorkflowStore = ownsWorkflowStore; final DashboardConfig _config; final String _namespace; - final Broker _broker; - final ResultBackend? _backend; + final PayloadSigner? _signer; + Broker _broker; + ResultBackend? _backend; + final WorkflowStore? _workflowStore; + RevokeStore? _revokeStore; + Future Function()? _disposeContext; + final Future<_DashboardRuntimeContext> Function()? _reloadRuntimeContext; + Future? _runtimeReconnectFuture; + Future _runtimeOperationQueue = Future.value(); + final bool _ownsWorkflowStore; + var _closed = false; /// Creates a dashboard service using [config]. 
/// /// Uses [createDefaultContext] to set up broker and backend from environment. static Future connect(DashboardConfig config) async { - final ctx = await createDefaultContext( - environment: Map.from(config.environment), + final runtimeContext = await _createRuntimeContext(config); + + final workflowStore = await _connectWorkflowStore( + config.environment['STEM_WORKFLOW_STORE_URL'], + namespace: _resolveWorkflowNamespace(config), + tls: config.tls, ); return StemDashboardService._( config: config, - broker: ctx.broker, - backend: ctx.backend, + broker: runtimeContext.broker, + backend: runtimeContext.backend, + workflowStore: workflowStore, + revokeStore: runtimeContext.revokeStore, + disposeContext: runtimeContext.dispose, + reloadRuntimeContext: () => _createRuntimeContext(config), + ownsWorkflowStore: true, ); } @@ -73,47 +142,164 @@ class StemDashboardService implements DashboardDataSource { required DashboardConfig config, required Broker broker, ResultBackend? backend, + WorkflowStore? workflowStore, + RevokeStore? revokeStore, }) async { return StemDashboardService._( config: config, broker: broker, backend: backend, + workflowStore: workflowStore, + revokeStore: revokeStore, ); } @override Future> fetchQueueSummaries() async { - final queues = await _discoverQueues(); - final summaries = []; + try { + return await _withRuntimeReconnectRetry(_fetchQueueSummariesImpl); + } on Object catch (error, stack) { + _logReadFailure('fetchQueueSummaries', error, stack); + return const []; + } + } - for (final queue in queues) { - final pending = await _broker.pendingCount(queue) ?? 0; - final inflight = await _broker.inflightCount(queue) ?? 
0; - final dead = await _deadLetterCount(queue); + @override + Future> fetchWorkerStatuses() async { + try { + final heartbeats = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) return const []; + return backend.listWorkerHeartbeats(); + }); + return heartbeats.map(WorkerStatus.fromHeartbeat).toList(growable: false) + ..sort((a, b) => a.workerId.compareTo(b.workerId)); + } on Object catch (error, stack) { + _logReadFailure('fetchWorkerStatuses', error, stack); + return const []; + } + } - summaries.add( - QueueSummary( - queue: queue, - pending: pending, - inflight: inflight, - deadLetters: dead, - ), - ); + @override + Future> fetchTaskStatuses({ + TaskState? state, + String? queue, + int limit = 100, + int offset = 0, + }) async { + final resolvedQueue = queue?.trim(); + final boundedLimit = limit.clamp(1, 500); + final boundedOffset = offset < 0 ? 0 : offset; + try { + final page = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) { + return const TaskStatusPage(items: []); + } + return backend.listTaskStatuses( + TaskStatusListRequest( + state: state, + queue: resolvedQueue == null || resolvedQueue.isEmpty + ? 
null + : resolvedQueue, + limit: boundedLimit, + offset: boundedOffset, + ), + ); + }); + return page.items + .map(DashboardTaskStatusEntry.fromRecord) + .toList(growable: false); + } on Object catch (error, stack) { + _logReadFailure('fetchTaskStatuses', error, stack); + return const []; } + } - summaries.sort((a, b) => a.queue.compareTo(b.queue)); - return summaries; + @override + Future fetchTaskStatus(String taskId) async { + final trimmed = taskId.trim(); + if (trimmed.isEmpty) return null; + + try { + final record = await _findTaskStatusRecord(trimmed); + if (record != null) { + return DashboardTaskStatusEntry.fromRecord(record); + } + final backend = _backend; + if (backend == null) return null; + final status = await backend.get(trimmed); + if (status == null) { + return null; + } + return DashboardTaskStatusEntry.fromStatus(status); + } on Object { + return null; + } } @override - Future> fetchWorkerStatuses() async { - final backend = _backend; - if (backend == null) return const []; + Future> fetchTaskStatusesForRun( + String runId, { + int limit = 200, + }) async { + final trimmed = runId.trim(); + if (trimmed.isEmpty) return const []; try { - final heartbeats = await backend.listWorkerHeartbeats(); - return heartbeats.map(WorkerStatus.fromHeartbeat).toList(growable: false) - ..sort((a, b) => a.workerId.compareTo(b.workerId)); + final page = await _withRuntimeReconnectRetry(() async { + final backend = _backend; + if (backend == null) { + return const TaskStatusPage(items: []); + } + return backend.listTaskStatuses( + TaskStatusListRequest( + meta: {'stem.workflow.runId': trimmed}, + limit: limit.clamp(1, 500), + ), + ); + }); + return page.items + .map(DashboardTaskStatusEntry.fromRecord) + .toList(growable: false); + } on Object { + return const []; + } + } + + @override + Future fetchWorkflowRun(String runId) async { + final store = _workflowStore; + if (store == null) return null; + + final trimmed = runId.trim(); + if (trimmed.isEmpty) return null; 
+ + try { + final run = await store.get(trimmed); + if (run == null) return null; + return DashboardWorkflowRunSnapshot.fromRunState(run); + } on Object { + return null; + } + } + + @override + Future> fetchWorkflowSteps( + String runId, + ) async { + final store = _workflowStore; + if (store == null) return const []; + + final trimmed = runId.trim(); + if (trimmed.isEmpty) return const []; + + try { + final steps = await store.listSteps(trimmed); + return steps + .map(DashboardWorkflowStepSnapshot.fromEntry) + .toList(growable: false) + ..sort((a, b) => a.position.compareTo(b.position)); } on Object { return const []; } @@ -130,7 +316,7 @@ class StemDashboardService implements DashboardDataSource { maxRetries: request.maxRetries, meta: const {'source': 'dashboard'}, ); - await _broker.publish(envelope); + await _publishEnvelope(envelope); } @override @@ -140,7 +326,91 @@ class StemDashboardService implements DashboardDataSource { bool dryRun = false, }) async { final bounded = limit.clamp(1, 500); - return _broker.replayDeadLetters(queue, limit: bounded, dryRun: dryRun); + return _withRuntimeReconnectRetry( + () => _broker.replayDeadLetters(queue, limit: bounded, dryRun: dryRun), + ); + } + + @override + Future replayTaskById(String taskId, {String? 
queue}) async { + final trimmedTask = taskId.trim(); + if (trimmedTask.isEmpty) return false; + + final resolvedQueue = await _resolveReplayQueue(trimmedTask, queue: queue); + if (resolvedQueue == null) { + return false; + } + + final deadLetter = await _withRuntimeReconnectRetry( + () => _broker.getDeadLetter(resolvedQueue, trimmedTask), + ); + if (deadLetter == null) { + return false; + } + + final now = stemNow().toUtc(); + final original = deadLetter.envelope; + final replayMeta = Map.from(original.meta) + ..['source'] = 'dashboard' + ..['dashboard.replayFromTaskId'] = trimmedTask + ..['dashboard.replayedAt'] = now.toIso8601String(); + final replay = original.copyWith( + id: generateEnvelopeId(), + attempt: 0, + enqueuedAt: now, + meta: replayMeta, + ); + await _publishEnvelope(replay); + + final backend = _backend; + if (backend != null) { + final queuedMeta = { + 'queue': replay.queue, + 'task': replay.name, + ...replayMeta, + }; + await backend.set( + replay.id, + TaskState.queued, + attempt: 0, + meta: queuedMeta, + ); + } + return true; + } + + @override + Future revokeTask( + String taskId, { + bool terminate = false, + String? reason, + }) async { + final trimmedTask = taskId.trim(); + if (trimmedTask.isEmpty) return false; + + final now = stemNow().toUtc(); + final trimmedReason = reason?.trim(); + final entry = RevokeEntry( + namespace: _namespace, + taskId: trimmedTask, + version: generateRevokeVersion(), + issuedAt: now, + terminate: terminate, + reason: trimmedReason == null || trimmedReason.isEmpty + ? 
null + : trimmedReason, + requestedBy: 'dashboard', + ); + try { + final store = _revokeStore; + if (store == null) { + return false; + } + await store.upsertAll([entry]); + return true; + } on Object { + return false; + } } @override @@ -172,7 +442,7 @@ class StemDashboardService implements DashboardDataSource { meta: const {'source': 'dashboard'}, enqueuedAt: now, ); - await _broker.publish(envelope); + await _publishEnvelope(envelope); } final expectedReplies = command.targets.isEmpty @@ -180,14 +450,18 @@ class StemDashboardService implements DashboardDataSource { : command.targets.length; final prefetch = expectedReplies == null ? 8 : expectedReplies.clamp(1, 32); - final subscription = _broker.consume( - RoutingSubscription.singleQueue(replyQueue), - consumerGroup: _controlConsumerGroup, - consumerName: 'dashboard-${command.requestId}', - prefetch: prefetch, + final subscription = await _withRuntimeReconnectRetry>( + () async { + return _broker.consume( + RoutingSubscription.singleQueue(replyQueue), + consumerGroup: _controlConsumerGroup, + consumerName: 'dashboard-${command.requestId}', + prefetch: prefetch, + ); + }, ); - final iterator = StreamIterator(subscription); + final iterator = StreamIterator(subscription); final replies = []; final deadline = stemNow().add(timeout); @@ -210,9 +484,11 @@ class StemDashboardService implements DashboardDataSource { try { final reply = controlReplyFromEnvelope(delivery.envelope); replies.add(reply); - await _broker.ack(delivery); + await _withRuntimeReconnectRetry(() => _broker.ack(delivery)); } on Object { - await _broker.nack(delivery, requeue: false); + await _withRuntimeReconnectRetry( + () => _broker.nack(delivery, requeue: false), + ); } if (expectedReplies != null && replies.length >= expectedReplies) { @@ -229,9 +505,155 @@ class StemDashboardService implements DashboardDataSource { @override Future close() async { - // Note: The broker and backend will be closed when the context is disposed. 
- // Since we got them from createDefaultContext, we don't own their - // lifecycle. + if (_closed) return; + _closed = true; + + if (_ownsWorkflowStore) { + await _disposeWorkflowStore(_workflowStore); + } + + await _disposeRuntimeContext(); + } + + Future> _fetchQueueSummariesImpl() async { + final queues = await _discoverQueues(); + final summaries = []; + + for (final queue in queues) { + final pending = await _broker.pendingCount(queue) ?? 0; + final inflight = await _broker.inflightCount(queue) ?? 0; + final dead = await _deadLetterCount(queue); + + summaries.add( + QueueSummary( + queue: queue, + pending: pending, + inflight: inflight, + deadLetters: dead, + ), + ); + } + + summaries.sort((a, b) => a.queue.compareTo(b.queue)); + return summaries; + } + + Future _withRuntimeReconnectRetry(Future Function() action) { + return _serializeRuntimeAccess(() async { + try { + return await action(); + } on Object catch (error) { + final recovered = await _recoverRuntimeContextIfNeeded(error); + if (!recovered) { + rethrow; + } + return action(); + } + }); + } + + Future _recoverRuntimeContextIfNeeded(Object error) async { + if (_closed || !_isRecoverableConnectionError(error)) { + return false; + } + + final reloadRuntimeContext = _reloadRuntimeContext; + if (reloadRuntimeContext == null) { + return false; + } + + try { + await _reconnectRuntimeContext(reloadRuntimeContext); + return true; + } on Object { + return false; + } + } + + bool _isRecoverableConnectionError(Object error) { + if (error is SocketException || + error is IOException || + error is StateError) { + return true; + } + final message = '$error'.toLowerCase(); + return message.contains('streamsink is closed') || + message.contains('stream is closed') || + message.contains('connection closed') || + message.contains('not connected') || + message.contains('connection refused') || + message.contains('socket is closed') || + message.contains('broken pipe') || + message.contains('timed out') || + 
message.contains('connection reset'); + } + + Future _serializeRuntimeAccess(Future Function() action) { + final completer = Completer(); + _runtimeOperationQueue = _runtimeOperationQueue.catchError((_) {}).then(( + _, + ) async { + try { + completer.complete(await action()); + } on Object catch (error, stack) { + completer.completeError(error, stack); + } + }); + return completer.future; + } + + void _logReadFailure(String operation, Object error, StackTrace stack) { + stemLogger.warning( + 'Dashboard data read failed', + stemLogContext( + component: 'dashboard', + subsystem: 'service', + fields: { + 'operation': operation, + 'error': '$error', + 'stack': '$stack', + }, + ), + ); + } + + Future _reconnectRuntimeContext( + Future<_DashboardRuntimeContext> Function() reloadRuntimeContext, + ) async { + if (_runtimeReconnectFuture != null) { + return _runtimeReconnectFuture!; + } + final completer = Completer(); + _runtimeReconnectFuture = completer.future; + try { + final nextContext = await reloadRuntimeContext(); + final previousDispose = _disposeContext; + _broker = nextContext.broker; + _backend = nextContext.backend; + _revokeStore = nextContext.revokeStore; + _disposeContext = nextContext.dispose; + if (previousDispose != null) { + try { + await previousDispose(); + } on Object { + // Ignore disposal failures while recovering from connection errors. 
+ } + } + completer.complete(); + } on Object catch (error, stack) { + completer.completeError(error, stack); + rethrow; + } finally { + _runtimeReconnectFuture = null; + } + } + + Future _disposeRuntimeContext() async { + final disposeContext = _disposeContext; + _disposeContext = null; + if (disposeContext != null) { + await disposeContext(); + } } Future> _discoverQueues() async { @@ -294,11 +716,159 @@ class StemDashboardService implements DashboardDataSource { Future _purgeQueue(String queue) async { try { - await _broker.purge(queue); + await _withRuntimeReconnectRetry(() => _broker.purge(queue)); } on Object { // Some brokers may not support purge; ignore failures. } } + Future _publishEnvelope(Envelope envelope) async { + final signer = _signer; + final payload = signer == null ? envelope : await signer.sign(envelope); + await _withRuntimeReconnectRetry(() => _broker.publish(payload)); + } + + Future _findTaskStatusRecord(String taskId) async { + final backend = _backend; + if (backend == null) return null; + + var offset = 0; + const pageSize = 200; + const maxPages = 10; + + for (var pageIndex = 0; pageIndex < maxPages; pageIndex++) { + final page = await backend.listTaskStatuses( + TaskStatusListRequest(limit: pageSize, offset: offset), + ); + for (final item in page.items) { + if (item.status.id == taskId) { + return item; + } + } + final nextOffset = page.nextOffset; + if (nextOffset == null) { + break; + } + offset = nextOffset; + } + return null; + } + + Future _resolveReplayQueue( + String taskId, { + String? 
queue, + }) async { + final explicit = queue?.trim(); + if (explicit != null && explicit.isNotEmpty) { + return explicit; + } + + final status = await fetchTaskStatus(taskId); + final statusQueue = status?.queue.trim(); + if (statusQueue != null && statusQueue.isNotEmpty) { + return statusQueue; + } + + final queues = await _discoverQueues(); + for (final candidate in queues) { + final entry = await _broker.getDeadLetter(candidate, taskId); + if (entry != null) { + return candidate; + } + } + return null; + } + + static String _resolveWorkflowNamespace(DashboardConfig config) { + final raw = config.environment['STEM_WORKFLOW_NAMESPACE']?.trim(); + if (raw != null && raw.isNotEmpty) { + return raw; + } + return config.namespace; + } + + static Future _connectWorkflowStore( + String? url, { + required String namespace, + required TlsConfig tls, + }) async { + final trimmed = url?.trim(); + if (trimmed == null || trimmed.isEmpty) { + return null; + } + + final uri = Uri.parse(trimmed); + switch (uri.scheme) { + case 'redis': + case 'rediss': + return RedisWorkflowStore.connect( + trimmed, + namespace: namespace, + tls: tls, + ); + case 'postgres': + case 'postgresql': + case 'postgresql+ssl': + case 'postgres+ssl': + return PostgresWorkflowStore.connect( + trimmed, + namespace: namespace, + applicationName: 'stem-dashboard-workflow', + tls: tls, + ); + case 'sqlite': + final path = uri.path.isNotEmpty ? uri.path : 'workflow.sqlite'; + return SqliteWorkflowStore.open(File(path)); + case 'file': + return SqliteWorkflowStore.open(File(uri.toFilePath())); + case 'memory': + return InMemoryWorkflowStore(); + default: + return null; + } + } + + static Future _disposeWorkflowStore(WorkflowStore? 
store) async { + if (store is RedisWorkflowStore) { + await store.close(); + return; + } + if (store is PostgresWorkflowStore) { + await store.close(); + return; + } + if (store is SqliteWorkflowStore) { + await store.close(); + } + } + + static Future<_DashboardRuntimeContext> _createRuntimeContext( + DashboardConfig config, + ) async { + final context = await createDefaultContext( + environment: Map.from(config.environment), + ); + return _DashboardRuntimeContext( + broker: context.broker, + backend: context.backend, + revokeStore: context.revokeStore, + dispose: context.dispose, + ); + } + static const _controlConsumerGroup = 'stem-dashboard-control'; } + +class _DashboardRuntimeContext { + const _DashboardRuntimeContext({ + required this.broker, + required this.backend, + required this.revokeStore, + required this.dispose, + }); + + final Broker broker; + final ResultBackend? backend; + final RevokeStore? revokeStore; + final Future Function() dispose; +} diff --git a/packages/dashboard/lib/src/state/dashboard_state.dart b/packages/dashboard/lib/src/state/dashboard_state.dart index 1b4e193..12b1d9e 100644 --- a/packages/dashboard/lib/src/state/dashboard_state.dart +++ b/packages/dashboard/lib/src/state/dashboard_state.dart @@ -1,8 +1,10 @@ import 'dart:async'; +import 'dart:convert'; +import 'dart:io'; import 'package:meta/meta.dart'; import 'package:routed_hotwire/routed_hotwire.dart'; -import 'package:stem/stem.dart' show stemNow; +import 'package:stem/stem.dart' show TaskState, stemNow; import 'package:stem_dashboard/src/services/models.dart'; import 'package:stem_dashboard/src/services/stem_service.dart'; import 'package:stem_dashboard/src/ui/event_templates.dart'; @@ -14,6 +16,12 @@ class DashboardState { required this.service, this.pollInterval = const Duration(seconds: 5), this.eventLimit = 200, + this.auditLimit = 300, + this.alertWebhookUrls = const [], + this.alertBacklogThreshold = 500, + this.alertFailedTaskThreshold = 25, + 
this.alertOfflineWorkerThreshold = 1, + this.alertCooldown = const Duration(minutes: 5), }) : hub = TurboStreamHub(); /// Data source used to fetch queues and workers. @@ -28,10 +36,34 @@ class DashboardState { /// Maximum number of events retained in memory. final int eventLimit; + /// Maximum number of audit entries retained in memory. + final int auditLimit; + + /// Webhook URLs used for alert delivery. + final List alertWebhookUrls; + + /// Backlog threshold triggering an alert. + final int alertBacklogThreshold; + + /// Failed-task threshold triggering an alert. + final int alertFailedTaskThreshold; + + /// Offline-worker threshold triggering an alert. + final int alertOfflineWorkerThreshold; + + /// Minimum duration between repeated alerts of the same type. + final Duration alertCooldown; + Timer? _timer; List _previousQueues = const []; Map _previousWorkers = const {}; + String _previousQueueSignature = ''; + String _previousWorkerSignature = ''; + String _previousTaskSignature = ''; + var _hasPrimedRefresh = false; final _events = []; + final _auditEntries = []; + final _lastAlertAt = {}; Future _polling = Future.value(); DateTime? _lastPollAt; DashboardThroughput _throughput = const DashboardThroughput( @@ -46,6 +78,10 @@ class DashboardState { /// Most recent throughput calculation. DashboardThroughput get throughput => _throughput; + /// Recent audit entries in reverse chronological order. + List get auditEntries => + List.unmodifiable(_auditEntries); + /// Starts the polling loop and emits initial state. 
Future start() async { await _runPoll(); @@ -67,14 +103,35 @@ class DashboardState { Future runOnce() => _poll(); Future _poll() async { - final queues = await service.fetchQueueSummaries(); - final workers = await service.fetchWorkerStatuses(); + final queueFuture = service.fetchQueueSummaries(); + final workerFuture = service.fetchWorkerStatuses(); + final taskFuture = service.fetchTaskStatuses(limit: 120); + + final queues = await queueFuture; + final workers = await workerFuture; + final tasks = await taskFuture; _updateThroughput(queues); _generateQueueEvents(_previousQueues, queues); _generateWorkerEvents(_previousWorkers, { for (final worker in workers) worker.workerId: worker, }); + await _evaluateAlerts(queues: queues, workers: workers, tasks: tasks); + + final queueSignature = _queueSignature(queues); + final workerSignature = _workerSignature(workers); + final taskSignature = _taskSignature(tasks); + final changed = + queueSignature != _previousQueueSignature || + workerSignature != _previousWorkerSignature || + taskSignature != _previousTaskSignature; + if (_hasPrimedRefresh && changed) { + _broadcastRefreshSignal(); + } + _hasPrimedRefresh = true; + _previousQueueSignature = queueSignature; + _previousWorkerSignature = workerSignature; + _previousTaskSignature = taskSignature; _previousQueues = queues; _previousWorkers = {for (final worker in workers) worker.workerId: worker}; @@ -256,4 +313,241 @@ class DashboardState { if (delta < 0) return 'decreased by ${delta.abs()}'; return 'unchanged'; } + + String _queueSignature(List queues) { + final sorted = List.from(queues) + ..sort((a, b) => a.queue.compareTo(b.queue)); + return sorted + .map( + (queue) { + return '${queue.queue}:${queue.pending}:' + '${queue.inflight}:${queue.deadLetters}'; + }, + ) + .join('|'); + } + + String _workerSignature(List workers) { + final sorted = List.from(workers) + ..sort((a, b) => a.workerId.compareTo(b.workerId)); + return sorted + .map( + (worker) { + final stamp = 
worker.timestamp.toUtc().toIso8601String(); + return '${worker.workerId}:${worker.inflight}:$stamp'; + }, + ) + .join('|'); + } + + String _taskSignature(List tasks) { + return tasks + .map( + (task) { + final stamp = task.updatedAt.toUtc().toIso8601String(); + return '${task.id}:${task.state.name}:${task.attempt}:$stamp'; + }, + ) + .join('|'); + } + + void _broadcastRefreshSignal() { + final payload = turboStreamReplace( + target: 'dashboard-refresh-signal', + html: '${stemNow().toUtc().toIso8601String()}', + ); + hub.broadcast('stem-dashboard:refresh', [payload]); + } + + /// Records an audit entry. + void recordAudit({ + required String kind, + required String action, + required String status, + String? actor, + String? summary, + Map metadata = const {}, + }) { + final entry = DashboardAuditEntry( + id: 'audit-${stemNow().toUtc().microsecondsSinceEpoch}', + timestamp: stemNow().toUtc(), + kind: kind, + action: action, + status: status, + actor: actor, + summary: summary, + metadata: metadata, + ); + _auditEntries.insert(0, entry); + if (_auditEntries.length > auditLimit) { + _auditEntries.removeRange(auditLimit, _auditEntries.length); + } + _broadcastRefreshSignal(); + } + + Future _evaluateAlerts({ + required List queues, + required List workers, + required List tasks, + }) async { + final totalPending = queues.fold( + 0, + (total, queue) => total + queue.pending, + ); + if (totalPending >= alertBacklogThreshold) { + await _emitAlert( + key: 'queue.backlog.high', + summary: 'Backlog threshold exceeded: ' + '$totalPending >= $alertBacklogThreshold.', + metadata: { + 'pendingTotal': totalPending, + 'threshold': alertBacklogThreshold, + }, + ); + } + + final failedCount = tasks.where((task) { + return task.state == TaskState.failed || + task.state == TaskState.cancelled; + }).length; + if (failedCount >= alertFailedTaskThreshold) { + await _emitAlert( + key: 'tasks.failed.high', + summary: 'Failed task threshold exceeded: ' + '$failedCount >= 
$alertFailedTaskThreshold.', + metadata: { + 'failedCount': failedCount, + 'threshold': alertFailedTaskThreshold, + }, + ); + } + + final offlineWorkers = workers.where( + (worker) => worker.age > const Duration(minutes: 2), + ); + if (offlineWorkers.length >= alertOfflineWorkerThreshold) { + await _emitAlert( + key: 'workers.offline.high', + summary: + 'Offline workers threshold exceeded: ${offlineWorkers.length} >= ' + '$alertOfflineWorkerThreshold.', + metadata: { + 'offlineWorkers': offlineWorkers + .map((worker) => worker.workerId) + .toList( + growable: false, + ), + 'threshold': alertOfflineWorkerThreshold, + }, + ); + } + } + + Future _emitAlert({ + required String key, + required String summary, + Map metadata = const {}, + }) async { + final now = stemNow().toUtc(); + final last = _lastAlertAt[key]; + if (last != null && now.difference(last) < alertCooldown) { + return; + } + _lastAlertAt[key] = now; + + recordAudit( + kind: 'alert', + action: key, + status: 'triggered', + actor: 'system', + summary: summary, + metadata: metadata, + ); + _recordEvent( + DashboardEvent( + title: 'Alert: $key', + timestamp: now, + summary: summary, + metadata: metadata, + ), + ); + + if (alertWebhookUrls.isEmpty) { + recordAudit( + kind: 'alert', + action: key, + status: 'skipped', + actor: 'system', + summary: 'No alert webhook URLs configured.', + ); + return; + } + await _sendAlertWebhooks(key: key, summary: summary, metadata: metadata); + } + + Future _sendAlertWebhooks({ + required String key, + required String summary, + required Map metadata, + }) async { + final payload = { + 'kind': 'stem-dashboard-alert', + 'key': key, + 'summary': summary, + 'timestamp': stemNow().toUtc().toIso8601String(), + 'metadata': metadata, + }; + + for (final rawUrl in alertWebhookUrls) { + final url = rawUrl.trim(); + if (url.isEmpty) continue; + final uri = Uri.tryParse(url); + if (uri == null || !uri.hasScheme || uri.host.isEmpty) { + recordAudit( + kind: 'alert', + action: key, + 
status: 'error', + actor: 'system', + summary: 'Invalid webhook URL: $url', + ); + continue; + } + + HttpClientRequest? request; + try { + final client = HttpClient(); + request = await client.postUrl(uri); + request.headers.contentType = ContentType.json; + request.add(utf8.encode(jsonEncode(payload))); + final response = await request.close(); + if (response.statusCode >= 200 && response.statusCode < 300) { + recordAudit( + kind: 'alert', + action: key, + status: 'sent', + actor: 'system', + summary: 'Alert delivered to $url.', + ); + } else { + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Webhook returned HTTP ${response.statusCode} for $url.', + ); + } + client.close(force: true); + } on Object catch (error) { + request?.abort(); + recordAudit( + kind: 'alert', + action: key, + status: 'error', + actor: 'system', + summary: 'Webhook delivery failed for $url: $error', + ); + } + } + } } diff --git a/packages/dashboard/lib/src/ui/audit.dart b/packages/dashboard/lib/src/ui/audit.dart new file mode 100644 index 0000000..daddb6f --- /dev/null +++ b/packages/dashboard/lib/src/ui/audit.dart @@ -0,0 +1,76 @@ +// Public helper functions in this file are intentionally undocumented to keep +// UI template files lightweight. +// ignore_for_file: public_member_api_docs + +import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/shared.dart'; + +String buildAuditContent(List entries) { + final actions = entries.where((entry) => entry.kind == 'action').length; + final alerts = entries.where((entry) => entry.kind == 'alert').length; + final failures = entries.where((entry) => entry.status == 'error').length; + + return ''' + + +
+ ${buildMetricCard('Entries', formatInt(entries.length), 'Recent audit records retained in dashboard memory.')} + ${buildMetricCard('Actions', formatInt(actions), 'Control/replay/revoke operations initiated by operators.')} + ${buildMetricCard('Alerts', formatInt(alerts), 'Automated threshold alerts emitted by polling logic.')} + ${buildMetricCard('Errors', formatInt(failures), 'Entries with status=error requiring follow-up.')} +
+ +
+
+

Timeline

+
+ + + + + + + + + + + + + + ${entries.isEmpty ? ''' + + + +''' : entries.take(250).map((entry) => ''' + + + + + + + + + +''').join()} + +
TimeKindActionStatusActorSummaryMetadata
No audit entries yet.
${formatRelative(entry.timestamp)}${escapeHtml(entry.kind)}${escapeHtml(entry.action)}${escapeHtml(entry.status)}${escapeHtml(entry.actor ?? 'system')}${escapeHtml(entry.summary ?? '—')}${escapeHtml(_formatMetadata(entry.metadata))}
+
+'''; +} + +String _formatMetadata(Map metadata) { + if (metadata.isEmpty) return '—'; + final values = metadata.entries + .map((entry) => '${entry.key}=${entry.value}') + .toList(growable: false); + return values.join(', '); +} diff --git a/packages/dashboard/lib/src/ui/content.dart b/packages/dashboard/lib/src/ui/content.dart index 8ed220b..b88cc74 100644 --- a/packages/dashboard/lib/src/ui/content.dart +++ b/packages/dashboard/lib/src/ui/content.dart @@ -1,10 +1,17 @@ import 'package:stem_dashboard/src/services/models.dart'; +import 'package:stem_dashboard/src/ui/audit.dart'; import 'package:stem_dashboard/src/ui/events.dart'; +import 'package:stem_dashboard/src/ui/failures.dart'; +import 'package:stem_dashboard/src/ui/jobs.dart'; import 'package:stem_dashboard/src/ui/layout.dart'; +import 'package:stem_dashboard/src/ui/namespaces.dart'; import 'package:stem_dashboard/src/ui/options.dart'; import 'package:stem_dashboard/src/ui/overview.dart'; +import 'package:stem_dashboard/src/ui/search.dart'; +import 'package:stem_dashboard/src/ui/task_detail.dart'; import 'package:stem_dashboard/src/ui/tasks.dart'; import 'package:stem_dashboard/src/ui/workers.dart'; +import 'package:stem_dashboard/src/ui/workflows.dart'; export 'package:stem_dashboard/src/ui/options.dart'; @@ -13,19 +20,76 @@ String buildPageContent({ required DashboardPage page, required List queues, required List workers, + List taskStatuses = const [], + DashboardTaskStatusEntry? taskDetail, + List runTimeline = const [], + DashboardWorkflowRunSnapshot? workflowRun, + List workflowSteps = const [], + List auditEntries = const [], DashboardThroughput? 
throughput, List events = const [], + String defaultNamespace = 'stem', TasksPageOptions tasksOptions = const TasksPageOptions(), WorkersPageOptions workersOptions = const WorkersPageOptions(), + FailuresPageOptions failuresOptions = const FailuresPageOptions(), + SearchPageOptions searchOptions = const SearchPageOptions(), + NamespacesPageOptions namespacesOptions = const NamespacesPageOptions(), + WorkflowsPageOptions workflowsOptions = const WorkflowsPageOptions(), + JobsPageOptions jobsOptions = const JobsPageOptions(), }) { switch (page) { case DashboardPage.overview: - return buildOverviewContent(queues, workers, throughput); + return buildOverviewContent( + queues, + workers, + throughput, + taskStatuses, + defaultNamespace, + ); case DashboardPage.tasks: - return buildTasksContent(queues, tasksOptions); + return buildTasksContent(queues, tasksOptions, taskStatuses); + case DashboardPage.taskDetail: + return buildTaskDetailContent( + taskDetail, + runTimeline, + workflowRun, + workflowSteps, + ); + case DashboardPage.failures: + return buildFailuresContent(taskStatuses, failuresOptions); + case DashboardPage.search: + return buildSearchContent( + options: searchOptions, + queues: queues, + workers: workers, + taskStatuses: taskStatuses, + auditEntries: auditEntries, + ); + case DashboardPage.audit: + return buildAuditContent(auditEntries); case DashboardPage.events: return buildEventsContent(events); case DashboardPage.workers: return buildWorkersContent(workers, queues, workersOptions); + case DashboardPage.namespaces: + return buildNamespacesContent( + queues: queues, + workers: workers, + taskStatuses: taskStatuses, + options: namespacesOptions, + defaultNamespace: defaultNamespace, + ); + case DashboardPage.workflows: + return buildWorkflowsContent( + taskStatuses: taskStatuses, + options: workflowsOptions, + ); + case DashboardPage.jobs: + return buildJobsContent(taskStatuses: taskStatuses, options: jobsOptions); } } + +/// Builds inline expandable-row 
content for task table details. +String buildTaskInlineContent(DashboardTaskStatusEntry? task) { + return buildTaskInlinePanel(task); +} diff --git a/packages/dashboard/lib/src/ui/event_templates.dart b/packages/dashboard/lib/src/ui/event_templates.dart index 013454e..0446c54 100644 --- a/packages/dashboard/lib/src/ui/event_templates.dart +++ b/packages/dashboard/lib/src/ui/event_templates.dart @@ -11,11 +11,11 @@ String renderEventItem(DashboardEvent event) { .map((entry) => '${entry.key}: ${entry.value}') .join(); final summary = event.summary != null && event.summary!.isNotEmpty - ? '

${event.summary}

' + ? '

${event.summary}

' : ''; return ''' -
+
${event.title} $timestamp diff --git a/packages/dashboard/lib/src/ui/events.dart b/packages/dashboard/lib/src/ui/events.dart index 98b0e16..6cbefdc 100644 --- a/packages/dashboard/lib/src/ui/events.dart +++ b/packages/dashboard/lib/src/ui/events.dart @@ -8,8 +8,8 @@ import 'package:stem_dashboard/src/ui/event_templates.dart'; String buildEventsContent(List events) { final items = events.isEmpty ? ''' -
-

No events captured yet

+
+

No events captured yet

Configure the dashboard event bridge to stream Stem signals (enqueue, start, retry, completion) into Redis. Once connected, updates will appear here automatically via Turbo Streams. @@ -19,14 +19,14 @@ String buildEventsContent(List events) { : events.map(renderEventItem).join(); return ''' -