From f4a742bc227e6dcf8d010cc017ad614dd069d097 Mon Sep 17 00:00:00 2001 From: Maurice Kraus Date: Wed, 25 Feb 2026 09:05:22 +0000 Subject: [PATCH 1/3] fix: deduplicate repeated jobs from squeue output --- src/slurm_backend.py | 6 ++++++ tests/test_slurm_backend_job_parsing.py | 27 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/slurm_backend.py b/src/slurm_backend.py index 585161d..1f3fb3d 100644 --- a/src/slurm_backend.py +++ b/src/slurm_backend.py @@ -108,6 +108,7 @@ def get_job_stats(): return [] jobs_data = [] + seen_jobs: set[tuple[str, str]] = set() lines = output.split("\n") for line in lines[1:]: @@ -117,6 +118,11 @@ def get_job_stats(): if len(parts) < 17: continue + job_key = (parts[0], parts[15]) # (job id, submit timestamp) + if job_key in seen_jobs: + continue + seen_jobs.add(job_key) + gpu_count = "-" gpu_field = parts[8] try: diff --git a/tests/test_slurm_backend_job_parsing.py b/tests/test_slurm_backend_job_parsing.py index 0589d52..7316627 100644 --- a/tests/test_slurm_backend_job_parsing.py +++ b/tests/test_slurm_backend_job_parsing.py @@ -18,6 +18,14 @@ def _build_squeue_output() -> str: return "\n".join([header, *rows]) +def _build_squeue_output_with_duplicate_rows() -> str: + base = _build_squeue_output().splitlines() + header = base[0] + rows = base[1:] + # Reproduce production symptom: same job rows repeated in the output. + return "\n".join([header, *rows, *rows]) + + def test_get_job_stats_parses_typed_and_untyped_gpu_counts(monkeypatch): monkeypatch.setattr(slurm_backend, "run_slurm_command", lambda _cmd: _build_squeue_output()) @@ -29,3 +37,22 @@ def test_get_job_stats_parses_typed_and_untyped_gpu_counts(monkeypatch): assert jobs_by_id["100003"]["gpu"] == "1" assert jobs_by_id["100004"]["gpu"] == "64" assert jobs_by_id["100005"]["gpu"] == "-" + + +def test_get_job_stats_deduplicates_repeated_rows(monkeypatch): + monkeypatch.setattr( + slurm_backend, + "run_slurm_command", + lambda _cmd: _build_squeue_output_with_duplicate_rows(), + ) + + jobs = slurm_backend.get_job_stats() + + assert len(jobs) == 5 + assert [job["id"] for job in jobs] == [ + "100001", + "100002", + "100003", + "100004", + "100005", + ] From 5f3deb77fa51122c4b9fbb82b7241749a08200a1 Mon Sep 17 00:00:00 2001 From: Maurice Kraus Date: Wed, 25 Feb 2026 09:18:03 +0000 Subject: [PATCH 2/3] fix: prevent startup duplicate job rows --- src/slurm_backend.py | 10 ++++----- src/smon_dashboard.py | 24 +++++++++++++++++++- tests/test_slurm_backend_job_parsing.py | 27 +++++++++++++++++++++++ tests/test_smon_dashboard_job_startup.py | 28 ++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 tests/test_smon_dashboard_job_startup.py diff --git a/src/slurm_backend.py b/src/slurm_backend.py index 1f3fb3d..49e0f66 100644 --- a/src/slurm_backend.py +++ b/src/slurm_backend.py @@ -108,7 +108,7 @@ def get_job_stats(): return [] jobs_data = [] - seen_jobs: set[tuple[str, str]] = set() + seen_job_ids: set[str] = set() lines = output.split("\n") for line in lines[1:]: @@ -118,10 +118,10 @@ def get_job_stats(): if len(parts) < 17: continue - job_key = (parts[0], parts[15]) # (job id, submit timestamp) - if job_key in seen_jobs: + job_id = parts[0] + if job_id in seen_job_ids: continue - seen_jobs.add(job_key) + seen_job_ids.add(job_id) gpu_count = "-" gpu_field = parts[8] @@ -144,7 +144,7 @@ def get_job_stats(): jobs_data.append( { - "id": parts[0], + "id": job_id, "user": parts[1], "state": parts[2], "time": parts[3], diff --git a/src/smon_dashboard.py b/src/smon_dashboard.py index 9833bfe..662199a 100644 --- a/src/smon_dashboard.py +++ b/src/smon_dashboard.py @@ -49,6 +49,23 @@ } +def _dedupe_jobs_by_id(jobs: list[dict]) -> list[dict]: + deduped: list[dict] = [] + seen_job_ids: set[str] = set() + + for job in jobs: + job_id = str(job.get("id", "")) + if not job_id: + deduped.append(job) + continue + if job_id in seen_job_ids: + continue + seen_job_ids.add(job_id) + deduped.append(job) + + return deduped + + def _build_dashboard_css() -> str: css = """ Screen { layout: vertical; } @@ -360,6 +377,9 @@ def on_mount(self) -> None: node_table.add_columns("Node", "State", "CPU", "Mem", "GPU") node_table.zebra_stripes = True + # Ensure the jobs table has a stable schema before first render. + self.rebuild_job_columns() + # Focus the configured default pane if CONFIG.default_pane == "nodes": self.query_one("#node_table", DataTable).focus() @@ -555,6 +575,8 @@ def action_manual_refresh(self): self.notify("Data refreshed", timeout=1.5) def watch_show_compact(self, value: bool) -> None: + if not self.is_running: + return self.rebuild_job_columns() self.update_data() @@ -728,7 +750,7 @@ def update_data(self): self.last_refresh_time = time.time() nodes, theo, real = get_cluster_stats() - jobs = get_job_stats() + jobs = _dedupe_jobs_by_id(get_job_stats()) total_jobs = len(jobs) jobs = self._filter_jobs(jobs) visible_jobs = len(jobs) diff --git a/tests/test_slurm_backend_job_parsing.py b/tests/test_slurm_backend_job_parsing.py index 7316627..bacffbf 100644 --- a/tests/test_slurm_backend_job_parsing.py +++ b/tests/test_slurm_backend_job_parsing.py @@ -26,6 +26,20 @@ def _build_squeue_output_with_duplicate_rows() -> str: return "\n".join([header, *rows, *rows]) +def _build_squeue_output_with_same_id_different_submit() -> str: + header = ( + "JOBID USER STATE TIME LEFT PRIO NODES REASON GRES NAME CPU MEM PART " + "ACCOUNT QOS SUBMIT DEP" + ) + rows = [ + "100001 user01 PENDING 00:00:00 00:50:00 100 1 (Priority) gres/gpu:h100:4 ruler-dt-de 64 256G all acct01 normal 2026-02-24T13:25:09 (null)", + # Same job repeated with updated runtime/submit rendering from another snapshot. + "100001 user01 PENDING 00:00:00 00:49:59 100 1 (Priority) gres/gpu:h100:4 ruler-dt-de 64 256G all acct01 normal 2026-02-24T13:25:10 (null)", + "100002 user02 PENDING 00:00:00 00:40:00 110 2 (Priority) gres/gpu:h100:1 longbench-dt-de 128 1T all acct02 normal 2026-02-24T13:44:30 (null)", + ] + return "\n".join([header, *rows]) + + def test_get_job_stats_parses_typed_and_untyped_gpu_counts(monkeypatch): monkeypatch.setattr(slurm_backend, "run_slurm_command", lambda _cmd: _build_squeue_output()) @@ -56,3 +70,16 @@ def test_get_job_stats_deduplicates_repeated_rows(monkeypatch): "100004", "100005", ] + + +def test_get_job_stats_deduplicates_same_job_id_with_changed_submit(monkeypatch): + monkeypatch.setattr( + slurm_backend, + "run_slurm_command", + lambda _cmd: _build_squeue_output_with_same_id_different_submit(), + ) + + jobs = slurm_backend.get_job_stats() + + assert len(jobs) == 2 + assert [job["id"] for job in jobs] == ["100001", "100002"] diff --git a/tests/test_smon_dashboard_job_startup.py b/tests/test_smon_dashboard_job_startup.py new file mode 100644 index 0000000..86cf84c --- /dev/null +++ b/tests/test_smon_dashboard_job_startup.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from smon_dashboard import SlurmDashboard, _dedupe_jobs_by_id + + +def test_dedupe_jobs_by_id_removes_repeated_ids_preserving_order(): + jobs = [ + {"id": "86672", "name": "ruler-dt-de"}, + {"id": "86676", "name": "longbench-dt-de"}, + {"id": "86672", "name": "ruler-dt-de"}, + {"id": "86661", "name": "raim_hybrid_original"}, + ] + + deduped = _dedupe_jobs_by_id(jobs) + + assert [job["id"] for job in deduped] == ["86672", "86676", "86661"] + + +def test_watch_show_compact_is_ignored_before_mount(monkeypatch): + app = SlurmDashboard() + + def fail(*_args, **_kwargs): + raise AssertionError("should not run before mount") + + monkeypatch.setattr(app, "rebuild_job_columns", fail) + monkeypatch.setattr(app, "update_data", fail) + + app.watch_show_compact(True) From f5bb7d679a0d46408b531c9186755b8c8a274d14 Mon Sep 17 00:00:00 2001 From: Maurice Kraus Date: Wed, 25 Feb 2026 09:18:30 +0000 Subject: [PATCH 3/3] chore(release): prepare 0.4.2 hotfix changelog --- CHANGELOG.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dfeaa6..656e750 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht ## [Unreleased] +## [0.4.2] - 2026-02-25 + +### Added +- Startup-focused regression tests for duplicate job rendering scenarios in `smon_dashboard`. +- Additional `squeue` duplicate test cases in `slurm_backend` for repeated rows and repeated job IDs. + +### Fixed +- Prevent duplicate job rows at startup by deduplicating job IDs before rendering the jobs table. +- Initialize jobs table columns before first data paint to avoid startup render inconsistencies. +- Ignore pre-runtime reactive compact-mode updates that could trigger redundant startup table refreshes. + ## [0.4.1] - 2026-02-23 ### Added @@ -72,7 +83,8 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht ### Added - Initial `smon` release. -[Unreleased]: https://github.com/RuHae/smon/compare/v0.4.1...HEAD +[Unreleased]: https://github.com/RuHae/smon/compare/v0.4.2...HEAD +[0.4.2]: https://github.com/RuHae/smon/compare/v0.4.1...v0.4.2 [0.4.1]: https://github.com/RuHae/smon/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/RuHae/smon/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/RuHae/smon/compare/v0.2.0...v0.3.0