5 changes: 3 additions & 2 deletions scripts/reinforcement_learning/rsl_rl/train.py
@@ -32,6 +32,7 @@
"--distributed", action="store_true", default=False, help="Run training with multiple GPUs or nodes."
)
parser.add_argument("--export_io_descriptors", action="store_true", default=False, help="Export IO descriptors.")
parser.add_argument("--timer", action="store_true", default=False, help="Enable IsaacLab Timer measurements/output.")
# append RSL-RL cli arguments
cli_args.add_rsl_rl_args(parser)
# append AppLauncher cli args
@@ -83,8 +84,8 @@

from isaaclab.utils.timer import Timer

Timer.enable = False
Timer.enable_display_output = False
Timer.enable = args_cli.timer
Timer.enable_display_output = args_cli.timer

import isaaclab_tasks_experimental # noqa: F401

70 changes: 60 additions & 10 deletions source/isaaclab/isaaclab/envs/manager_based_rl_env.py
@@ -15,6 +15,7 @@

from isaaclab.managers import CommandManager, CurriculumManager, RewardManager, TerminationManager
from isaaclab.ui.widgets import ManagerLiveVisualizer
from isaaclab.utils.timer import Timer

from .common import VecEnvStepReturn
from .manager_based_env import ManagerBasedEnv
@@ -149,6 +150,7 @@ def setup_manager_visualizers(self):
Operations - MDP
"""

@Timer(name="env_step", msg="Step took:", enable=True, format="us")
def step(self, action: torch.Tensor) -> VecEnvStepReturn:
"""Execute one time-step of the environment's dynamics and reset terminated environments.

@@ -169,7 +171,14 @@ def step(self, action: torch.Tensor) -> VecEnvStepReturn:
A tuple containing the observations, rewards, resets (terminated and truncated) and extras.
"""
# process actions
self.action_manager.process_action(action.to(self.device))
action_device = action.to(self.device)
with Timer(
name="action_manager.process_action",
msg="ActionManager.process_action took:",
enable=True,
format="us",
):
self.action_manager.process_action(action_device)

self.recorder_manager.record_pre_step()

@@ -181,11 +190,18 @@ def step(self, action: torch.Tensor) -> VecEnvStepReturn:
for _ in range(self.cfg.decimation):
self._sim_step_counter += 1
# set actions into buffers
self.action_manager.apply_action()
with Timer(
name="action_manager.apply_action",
msg="ActionManager.apply_action took:",
enable=True,
format="us",
):
self.action_manager.apply_action()
# set actions into simulator
self.scene.write_data_to_sim()
# simulate
self.sim.step(render=False)
with Timer(name="simulate", msg="Newton simulation step took:", enable=True, format="us"):
self.sim.step(render=False)
self.recorder_manager.record_post_physics_decimation_step()
# render between steps only if the GUI or an RTX sensor needs it
# note: we assume the render interval to be the shortest accepted rendering interval.
@@ -199,16 +215,36 @@ def step(self, action: torch.Tensor) -> VecEnvStepReturn:
# -- update env counters (used for curriculum generation)
self.episode_length_buf += 1 # step in current episode (per env)
self.common_step_counter += 1 # total step (common for all envs)

# -- check terminations
self.reset_buf = self.termination_manager.compute()
self.reset_terminated = self.termination_manager.terminated
self.reset_time_outs = self.termination_manager.time_outs
with Timer(
name="termination_manager.compute",
msg="TerminationManager.compute took:",
enable=True,
format="us",
):
self.reset_buf = self.termination_manager.compute()
self.reset_terminated = self.termination_manager.terminated
self.reset_time_outs = self.termination_manager.time_outs

# -- reward computation
self.reward_buf = self.reward_manager.compute(dt=self.step_dt)
with Timer(
name="reward_manager.compute",
msg="RewardManager.compute took:",
enable=True,
format="us",
):
self.reward_buf = self.reward_manager.compute(dt=self.step_dt)

if len(self.recorder_manager.active_terms) > 0:
# update observations for recording if needed
self.obs_buf = self.observation_manager.compute()
with Timer(
name="observation_manager.compute",
msg="ObservationManager.compute took:",
enable=True,
format="us",
):
self.obs_buf = self.observation_manager.compute()
self.recorder_manager.record_post_step()

# -- reset envs that terminated/timed-out and log the episode information
@@ -228,13 +264,27 @@ def step(self, action: torch.Tensor) -> VecEnvStepReturn:
self.recorder_manager.record_post_reset(reset_env_ids)

# -- update command
self.command_manager.compute(dt=self.step_dt)
with Timer(
name="command_manager.compute",
msg="CommandManager.compute took:",
enable=True,
format="us",
):
self.command_manager.compute(dt=self.step_dt)

# -- step interval events
if "interval" in self.event_manager.available_modes:
self.event_manager.apply(mode="interval", dt=self.step_dt)

# -- compute observations
# note: done after reset to get the correct observations for reset envs
self.obs_buf = self.observation_manager.compute(update_history=True)
with Timer(
name="observation_manager.compute_update_history",
msg="ObservationManager.compute (update_history) took:",
enable=True,
format="us",
):
self.obs_buf = self.observation_manager.compute(update_history=True)

# return observations, rewards, resets and extras
return self.obs_buf, self.reward_buf, self.reset_terminated, self.reset_time_outs, self.extras
@@ -0,0 +1,141 @@
# Manager-based → Warp-first migration plan (experimental)

This doc captures the incremental plan to migrate the **manager-based workflow** (config-driven managers) to a **Warp-first, CUDA-graph-friendly** implementation, while keeping the same external behavior/API as the stable manager-based environments.

Scope: start with **Cartpole (manager-based)** as the pilot task.

## Goals

- Preserve the manager-based authoring model (tasks defined via config + MDP terms).
- Keep Gym API behavior the same as the stable manager-based envs.
- Make the core step/reset loop **graph-capturable** (fixed launch topology, persistent buffers, mask-based subset operations).
- Avoid touching stable code by iterating inside `isaaclab_experimental` / `isaaclab_tasks_experimental`.

## Current state (what exists already)

- **Experimental env entry point**: `isaaclab_experimental.envs:ManagerBasedRLEnvWarp`
- **Experimental Cartpole config + mdp**: `isaaclab_tasks_experimental.manager_based.classic.cartpole.*`
- **First manager fork**: `isaaclab_experimental.managers.RewardManager` (Warp-backed buffers / kernels)
- **Action path (Cartpole-minimal)**: `isaaclab_experimental.managers.ActionManager` + `isaaclab_experimental.envs.mdp.actions`
- Warp-first manager boundary (`process_action` consumes `wp.array`; may temporarily accept `torch.Tensor` and convert via `wp.from_torch`)
- Mask-based reset API (preferred for capture): `reset(mask: wp.array | torch.Tensor | None)`
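
A minimal sketch of the `process_action` boundary described above, assuming a manager-owned persistent buffer (class layout and attribute names are illustrative, not the actual `isaaclab_experimental.managers.ActionManager`):

```python
import torch
import warp as wp


class ActionManager:  # illustrative fork, trimmed to the boundary behavior only
    def __init__(self, num_envs: int, action_dim: int, device: str = "cuda:0"):
        # persistent Warp buffer with a stable pointer; downstream kernels read from it
        self._raw_actions = wp.zeros((num_envs, action_dim), dtype=wp.float32, device=device)

    def process_action(self, action: "wp.array | torch.Tensor") -> None:
        # Warp-first boundary: accept wp.array natively, temporarily tolerate torch.Tensor
        if isinstance(action, torch.Tensor):
            action = wp.from_torch(action.contiguous())
        # copy in-place into the persistent buffer instead of re-wrapping every step
        wp.copy(self._raw_actions, action)
```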

## Phased migration (minimal + incremental)

### Phase 0 — Baseline experimental entry points (no behavior change)

What:
- Register new Gym IDs that point at experimental env entry points.
- Keep task configs stable unless explicitly copied for isolation.

Why:
- Allows iteration without breaking stable tasks.

Deliverables:
- `isaaclab_experimental.envs:ManagerBasedRLEnvWarp`
- `Isaac-…-v0` IDs under `isaaclab_tasks_experimental`

### Phase 1 — Term-level Warp (keep Python managers, keep Torch-facing API)

What:
- Introduce Warp implementations for *select* MDP terms (Cartpole rewards/obs/events) while keeping:
- manager orchestration in Python
- env returns (`obs`, `rew`, `terminated`, `truncated`) as **torch.Tensor**
- Use `wp.array` buffers internally and expose `torch` views via `wp.to_torch(...)` at boundaries.

Why:
- Lets you validate Warp math + data plumbing without rewriting the entire manager framework.

Typical changes:
- Add `out` buffers to term cfgs (or manager-owned persistent outputs).
- Convert term functions from “return torch” → “write into wp.array” (see the sketch after the Cartpole focus list below).

Cartpole focus:
- Rewards: pole angle term, alive/terminated terms, etc.
- Observations: joint pos/vel relative
- Events: reset by offset (mask-based subset)
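
A minimal sketch of the “write into wp.array” shape for a pole-angle reward term (kernel and buffer names are illustrative; the actual term signatures in `isaaclab_tasks_experimental` may differ):

```python
import warp as wp


@wp.kernel
def pole_angle_reward_kernel(
    joint_pos: wp.array2d(dtype=wp.float32),  # (num_envs, num_joints)
    pole_joint_idx: int,
    weight: float,
    out: wp.array(dtype=wp.float32),  # persistent (num_envs,) reward buffer
):
    env_id = wp.tid()
    angle = joint_pos[env_id, pole_joint_idx]
    # quadratic pole-angle penalty, written in place instead of returned
    out[env_id] = -weight * angle * angle


def pole_angle_reward(joint_pos: wp.array, pole_joint_idx: int, weight: float, out: wp.array) -> None:
    # Phase 1 term shape: write into the cfg-provided `out` buffer, return nothing
    wp.launch(pole_angle_reward_kernel, dim=out.shape[0], inputs=[joint_pos, pole_joint_idx, weight, out])
```

The manager then launches against its persistent per-env reward buffer instead of allocating a return tensor on every call.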

### Phase 2 — Manager-level Warp buffers (still Python-loop scheduling)

What:
- Keep manager iteration in Python, but move all per-env buffers to Warp:
- reward accumulation buffers
- termination buffers
- (optionally) action/observation buffers
- Replace torch ops such as `nonzero()`, `torch.mean(...)`, and per-term tensor math with Warp kernels.

Why:
- Removes Torch from the hot-path while keeping the overall structure intact.

Deliverables (pilot):
- Warp-backed `RewardManager` (done/ongoing)
- Warp-backed `ActionManager` (Cartpole-minimal; mask-based reset; optional Torch shim)
- Next candidates: `TerminationManager`, `EventManager` (mask-based reset/interval), `ObservationManager`

Notes (graph/capture):
- `wp.from_torch(...)` creates a lightweight Warp wrapper around the Torch tensor memory, but you still pay Python-side overhead per call.
For CUDA graph capture, prefer **persistent buffers** (stable pointers) and update them in-place, then pass the persistent `wp.array`
through the manager boundary. This is the same caveat noted in `DirectRLEnvWarp.step`.

Notes (Torch → Warp porting):
- Torch implementations often “broadcast” per-joint constants into `(num_envs, action_dim)` tensors for convenience.
In Warp-first ports, prefer keeping these as **constant per-joint buffers** (e.g. `(action_dim,)` for `scale/offset`,
`(action_dim, 2)` for `clip`) and index by `j` inside kernels. This avoids redundant per-env storage and extra broadcast kernels,
while preserving behavior.
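
A sketch of that layout with hypothetical buffer names; the constants stay `(action_dim,)` / `(action_dim, 2)` and are indexed by `j` inside the kernel:

```python
import warp as wp


@wp.kernel
def scale_offset_clip_kernel(
    raw_actions: wp.array2d(dtype=wp.float32),  # (num_envs, action_dim)
    scale: wp.array(dtype=wp.float32),          # (action_dim,) constant per joint
    offset: wp.array(dtype=wp.float32),         # (action_dim,) constant per joint
    clip: wp.array2d(dtype=wp.float32),         # (action_dim, 2) -> [min, max] per joint
    processed: wp.array2d(dtype=wp.float32),    # (num_envs, action_dim)
):
    # launch with dim=(num_envs, action_dim) so wp.tid() yields the (env_id, j) pair
    env_id, j = wp.tid()
    value = raw_actions[env_id, j] * scale[j] + offset[j]
    # per-joint clamp without materializing (num_envs, action_dim) constant tensors
    processed[env_id, j] = wp.clamp(value, clip[j, 0], clip[j, 1])
```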

Notes (Warp CUDA graph capture in manager-based env):
- Partition the env step into **small stage functions** that only touch persistent CUDA buffers, then capture/replay them with Warp:
- `step_warp_action_process(...)`: `ActionManager.process_action` (env-step)
- `step_warp_action_apply(...)`: `ActionManager.apply_action` + `scene.write_data_to_sim` (sim-step)
- `step_warp_reward_compute(dt)`: `RewardManager.compute(dt)` (env-step)
- Use a helper like `capture_or_launch(fn, *args, **kwargs)` keyed by `fn.__name__` to standardize:
“if first time: `wp.ScopedCapture()`; else: `wp.capture_launch(graph)`”.
- Any captured stage that reads inputs must read from **stable pointers**:
e.g. keep a persistent `wp.array` action input buffer and copy incoming actions into it each step.
- If the launch topology changes (term list / shapes / enabling debug-vis, etc.), invalidate cached graphs and recapture.
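
A sketch of such a helper, assuming every captured stage only touches persistent CUDA buffers (stream capture records launches without executing them, hence the replay right after the first capture):

```python
import warp as wp

# fn.__name__ -> wp.Graph; clear this cache whenever the launch topology changes
_graphs = {}


def capture_or_launch(fn, *args, **kwargs):
    """Capture `fn` into a CUDA graph on first use, replay the cached graph afterwards."""
    key = fn.__name__
    if key not in _graphs:
        # record the stage's launches into a graph (this does not execute them)
        with wp.ScopedCapture() as capture:
            fn(*args, **kwargs)
        _graphs[key] = capture.graph
    # replay; the graph reads/writes the persistent buffers whose pointers were baked in at capture time
    wp.capture_launch(_graphs[key])
```

Per step, inputs are first copied in-place into the persistent buffers (e.g. `wp.copy(persistent_actions, wp.from_torch(actions))`) so the pointers baked into the graph stay valid.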

### Phase 3 — Dependency surfacing and hybrid handling

What:
- Identify and isolate subsystems that still create Torch buffers internally (common examples: contact sensors, some recorders).
- For each dependency:
- either keep as Torch “edge” (temporarily), or
- create Warp-first equivalents / alternate codepaths

Why:
- Some dependencies are not purely “MDP math” and need dedicated rewrites for graphability.

### Phase 4 — Graph-friendly orchestrator rewrite (fixed topology + masks)

What:
- Replace the dynamic parts of the env `step()`/`reset()` control flow:
- eliminate dynamic indexing patterns (e.g., `nonzero()` → env-id lists)
- use **boolean masks** (`wp.array(dtype=wp.bool)`) and kernels that apply to subsets
- ensure persistent buffers are allocated once and reused
- ensure launch order is stable and capture-ready

Why:
- CUDA graph capture requires stable execution topology.

Key design rules:
- **No per-step Python branching on data-dependent indices** (or keep it outside capture).
- Prefer `mask`-based APIs where possible (e.g., scene reset supports mask).
- Maintain one-time allocations; no shape changes.
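
A sketch of the mask-based subset pattern (hypothetical kernel; a real reset touches more state than joint positions):

```python
import warp as wp


@wp.kernel
def masked_reset_joint_pos_kernel(
    reset_mask: wp.array(dtype=wp.bool),  # (num_envs,) True where the env must reset
    default_joint_pos: wp.array2d(dtype=wp.float32),
    joint_pos: wp.array2d(dtype=wp.float32),
    num_joints: int,
):
    env_id = wp.tid()
    # fixed launch over all envs: the mask decides who is touched, so the launch
    # topology never depends on data (no host-side nonzero()/env-id lists)
    if reset_mask[env_id]:
        for j in range(num_joints):
            joint_pos[env_id, j] = default_joint_pos[env_id, j]
```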

### Phase 5 — Cleanup + consolidation

What:
- Remove transitional Torch shims and duplication where no longer needed.
- Optionally add a stable public entry point once the experimental path is validated.

## Practical “copy vs reuse” policy

- **Copy** into experimental when you expect semantic changes (Cartpole config/mdp, selected managers).
- **Reuse** stable implementations for everything else until it becomes a blocker.
- Prefer one fork at a time (e.g., start with `RewardManager`, then termination, then events).

## Suggested next steps (Cartpole)

- Keep Cartpole task config isolated under `isaaclab_tasks_experimental`.
- Continue stabilizing the experimental `RewardManager` interface (decide: term returns vs term writes).
- Add the next minimal manager fork: `TerminationManager` using Warp buffers (still returning torch views).
@@ -42,4 +42,12 @@
.. _`Task Design Workflows`: https://isaac-sim.github.io/IsaacLab/source/features/task_workflows.html
"""

from .direct_rl_env_warp import DirectRLEnvWarp
from .direct_rl_env_warp import DirectRLEnvWarp # noqa: F401
from .manager_based_env_warp import ManagerBasedEnvWarp # noqa: F401
from .manager_based_rl_env_warp import ManagerBasedRLEnvWarp # noqa: F401

__all__ = [
"DirectRLEnvWarp",
"ManagerBasedEnvWarp",
"ManagerBasedRLEnvWarp",
]