Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht

## [Unreleased]

## [0.4.2] - 2026-02-25

### Added
- Startup-focused regression tests for duplicate job rendering scenarios in `smon_dashboard`.
- Additional `squeue` duplicate test cases in `slurm_backend` for repeated rows and repeated job IDs.

### Fixed
- Prevent duplicate job rows at startup by deduplicating job IDs before rendering the jobs table.
- Initialize jobs table columns before first data paint to avoid startup render inconsistencies.
- Ignore pre-runtime reactive compact-mode updates that could trigger redundant startup table refreshes.

## [0.4.1] - 2026-02-23

### Added
Expand Down Expand Up @@ -72,7 +83,8 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
### Added
- Initial `smon` release.

[Unreleased]: https://github.com/RuHae/smon/compare/v0.4.1...HEAD
[Unreleased]: https://github.com/RuHae/smon/compare/v0.4.2...HEAD
[0.4.2]: https://github.com/RuHae/smon/compare/v0.4.1...v0.4.2
[0.4.1]: https://github.com/RuHae/smon/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/RuHae/smon/compare/v0.3.0...v0.4.0
[0.3.0]: https://github.com/RuHae/smon/compare/v0.2.0...v0.3.0
Expand Down
8 changes: 7 additions & 1 deletion src/slurm_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def get_job_stats():
return []

jobs_data = []
seen_job_ids: set[str] = set()
lines = output.split("\n")

for line in lines[1:]:
Expand All @@ -117,6 +118,11 @@ def get_job_stats():
if len(parts) < 17:
continue

job_id = parts[0]
if job_id in seen_job_ids:
continue
seen_job_ids.add(job_id)

gpu_count = "-"
gpu_field = parts[8]
try:
Expand All @@ -138,7 +144,7 @@ def get_job_stats():

jobs_data.append(
{
"id": parts[0],
"id": job_id,
"user": parts[1],
"state": parts[2],
"time": parts[3],
Expand Down
24 changes: 23 additions & 1 deletion src/smon_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@
}


def _dedupe_jobs_by_id(jobs: list[dict]) -> list[dict]:
deduped: list[dict] = []
seen_job_ids: set[str] = set()

for job in jobs:
job_id = str(job.get("id", ""))
if not job_id:
deduped.append(job)
continue
if job_id in seen_job_ids:
continue
seen_job_ids.add(job_id)
deduped.append(job)

return deduped


def _build_dashboard_css() -> str:
css = """
Screen { layout: vertical; }
Expand Down Expand Up @@ -360,6 +377,9 @@ def on_mount(self) -> None:
node_table.add_columns("Node", "State", "CPU", "Mem", "GPU")
node_table.zebra_stripes = True

# Ensure the jobs table has a stable schema before first render.
self.rebuild_job_columns()

# Focus the configured default pane
if CONFIG.default_pane == "nodes":
self.query_one("#node_table", DataTable).focus()
Expand Down Expand Up @@ -555,6 +575,8 @@ def action_manual_refresh(self):
self.notify("Data refreshed", timeout=1.5)

def watch_show_compact(self, value: bool) -> None:
    """Rebuild the jobs table layout whenever compact mode is toggled."""
    # Reactive watchers can fire before the app is running; ignoring those
    # avoids redundant table refreshes during startup.
    if self.is_running:
        self.rebuild_job_columns()
        self.update_data()

Expand Down Expand Up @@ -728,7 +750,7 @@ def update_data(self):
self.last_refresh_time = time.time()

nodes, theo, real = get_cluster_stats()
jobs = get_job_stats()
jobs = _dedupe_jobs_by_id(get_job_stats())
total_jobs = len(jobs)
jobs = self._filter_jobs(jobs)
visible_jobs = len(jobs)
Expand Down
54 changes: 54 additions & 0 deletions tests/test_slurm_backend_job_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,28 @@ def _build_squeue_output() -> str:
return "\n".join([header, *rows])


def _build_squeue_output_with_duplicate_rows() -> str:
    """Simulate squeue repeating every job row (the production symptom)."""
    header, *rows = _build_squeue_output().splitlines()
    return "\n".join([header, *(rows * 2)])


def _build_squeue_output_with_same_id_different_submit() -> str:
header = (
"JOBID USER STATE TIME LEFT PRIO NODES REASON GRES NAME CPU MEM PART "
"ACCOUNT QOS SUBMIT DEP"
)
rows = [
"100001 user01 PENDING 00:00:00 00:50:00 100 1 (Priority) gres/gpu:h100:4 ruler-dt-de 64 256G all acct01 normal 2026-02-24T13:25:09 (null)",
# Same job repeated with updated runtime/submit rendering from another snapshot.
"100001 user01 PENDING 00:00:00 00:49:59 100 1 (Priority) gres/gpu:h100:4 ruler-dt-de 64 256G all acct01 normal 2026-02-24T13:25:10 (null)",
"100002 user02 PENDING 00:00:00 00:40:00 110 2 (Priority) gres/gpu:h100:1 longbench-dt-de 128 1T all acct02 normal 2026-02-24T13:44:30 (null)",
]
return "\n".join([header, *rows])


def test_get_job_stats_parses_typed_and_untyped_gpu_counts(monkeypatch):
monkeypatch.setattr(slurm_backend, "run_slurm_command", lambda _cmd: _build_squeue_output())

Expand All @@ -29,3 +51,35 @@ def test_get_job_stats_parses_typed_and_untyped_gpu_counts(monkeypatch):
assert jobs_by_id["100003"]["gpu"] == "1"
assert jobs_by_id["100004"]["gpu"] == "64"
assert jobs_by_id["100005"]["gpu"] == "-"


def test_get_job_stats_deduplicates_repeated_rows(monkeypatch):
    """Repeated squeue rows must collapse to one parsed entry per job id."""
    monkeypatch.setattr(
        slurm_backend,
        "run_slurm_command",
        lambda _cmd: _build_squeue_output_with_duplicate_rows(),
    )

    parsed = slurm_backend.get_job_stats()

    expected_ids = ["100001", "100002", "100003", "100004", "100005"]
    assert len(parsed) == len(expected_ids)
    assert [entry["id"] for entry in parsed] == expected_ids


def test_get_job_stats_deduplicates_same_job_id_with_changed_submit(monkeypatch):
    """The first snapshot of a job id wins even when later fields differ."""
    monkeypatch.setattr(
        slurm_backend,
        "run_slurm_command",
        lambda _cmd: _build_squeue_output_with_same_id_different_submit(),
    )

    parsed = slurm_backend.get_job_stats()

    assert [entry["id"] for entry in parsed] == ["100001", "100002"]
    assert len(parsed) == 2
28 changes: 28 additions & 0 deletions tests/test_smon_dashboard_job_startup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

from smon_dashboard import SlurmDashboard, _dedupe_jobs_by_id


def test_dedupe_jobs_by_id_removes_repeated_ids_preserving_order():
    """First occurrence of each id survives and relative order is kept."""
    duplicated = [
        {"id": "86672", "name": "ruler-dt-de"},
        {"id": "86676", "name": "longbench-dt-de"},
        {"id": "86672", "name": "ruler-dt-de"},
        {"id": "86661", "name": "raim_hybrid_original"},
    ]

    result = _dedupe_jobs_by_id(duplicated)

    assert [entry["id"] for entry in result] == ["86672", "86676", "86661"]


def test_watch_show_compact_is_ignored_before_mount(monkeypatch):
    """The compact-mode watcher must be a no-op until the app is running."""
    dashboard = SlurmDashboard()

    def _must_not_run(*_args, **_kwargs):
        raise AssertionError("should not run before mount")

    # Either call would prove the watcher ran too early.
    monkeypatch.setattr(dashboard, "rebuild_job_columns", _must_not_run)
    monkeypatch.setattr(dashboard, "update_data", _must_not_run)

    dashboard.watch_show_compact(True)
Loading