diff --git a/scripts/benchmarks/benchmark_non_rl.py b/scripts/benchmarks/benchmark_non_rl.py index 9eef653455c..e7eeb24fe5a 100644 --- a/scripts/benchmarks/benchmark_non_rl.py +++ b/scripts/benchmarks/benchmark_non_rl.py @@ -12,10 +12,8 @@ import sys import time -from isaaclab.app import AppLauncher - # add argparse arguments -parser = argparse.ArgumentParser(description="Train an RL agent with RL-Games.") +parser = argparse.ArgumentParser(description="Benchmark non-RL environment.") parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") @@ -34,11 +32,22 @@ help="Benchmarking backend options, defaults OmniPerfKPIFile", ) parser.add_argument("--output_folder", type=str, default=None, help="Output folder for the benchmark.") +parser.add_argument( + "--kit", + action="store_true", + default=False, + help="Enable Isaac Sim Kit and use isaacsim.benchmark.services. Default: False (uses standalone benchmark).", +) + +# Conditionally add AppLauncher args only if --kit is enabled +if "--kit" in sys.argv: + from isaaclab.app import AppLauncher + + AppLauncher.add_app_launcher_args(parser) -# append AppLauncher cli args -AppLauncher.add_app_launcher_args(parser) # parse the arguments args_cli, hydra_args = parser.parse_known_args() + # always enable cameras to record video if args_cli.video: args_cli.enable_cameras = True @@ -48,49 +57,68 @@ app_start_time_begin = time.perf_counter_ns() -# launch omniverse app -app_launcher = AppLauncher(args_cli) -simulation_app = app_launcher.app +# Conditionally launch Isaac Sim +simulation_app = None +app_launcher = None +if args_cli.kit: + # Force Omniverse mode by setting environment variable + # This ensures SimulationApp is launched even without explicit visualizers + os.environ["LAUNCH_OV_APP"] = "1" + + # launch omniverse app + app_launcher = AppLauncher(args_cli) + simulation_app = app_launcher.app app_start_time_end = time.perf_counter_ns() """Rest everything follows.""" -# enable benchmarking extension -from isaacsim.core.utils.extensions import enable_extension +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) -enable_extension("isaacsim.benchmark.services") +# Import benchmark infrastructure based on kit flag +if args_cli.kit: + # enable benchmarking extension + from isaacsim.core.utils.extensions import enable_extension -# Set the benchmark settings according to the inputs -import carb + enable_extension("isaacsim.benchmark.services") -settings = carb.settings.get_settings() -settings.set("/exts/isaacsim.benchmark.services/metrics/metrics_output_folder", args_cli.output_folder) -settings.set("/exts/isaacsim.benchmark.services/metrics/randomize_filename_prefix", True) + # Set the benchmark settings according to the inputs + import carb -from isaacsim.benchmark.services import BaseIsaacBenchmark + settings = carb.settings.get_settings() + settings.set("/exts/isaacsim.benchmark.services/metrics/metrics_output_folder", args_cli.output_folder) + settings.set("/exts/isaacsim.benchmark.services/metrics/randomize_filename_prefix", True) -sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) + from isaacsim.benchmark.services import BaseIsaacBenchmark -from isaaclab.utils.timer import Timer -from scripts.benchmarks.utils 
import ( - get_isaaclab_version, - get_mujoco_warp_version, - get_newton_version, - log_app_start_time, - log_python_imports_time, - log_runtime_step_times, - log_scene_creation_time, - log_simulation_start_time, - log_task_start_time, - log_total_start_time, -) + from scripts.benchmarks.utils.benchmark_utils import create_kit_logging_functions, get_timer_value + + # Get all logging functions for kit mode + log_funcs = create_kit_logging_functions() +else: + # Use standalone benchmark services + from scripts.benchmarks.utils.benchmark_utils import create_standalone_logging_functions, get_timer_value + from scripts.benchmarks.utils.standalone_benchmark import StandaloneBenchmark + + # Get all logging functions for standalone mode + log_funcs = create_standalone_logging_functions() + +# Extract individual functions from the dictionary for easier use +get_isaaclab_version = log_funcs["get_isaaclab_version"] +get_mujoco_warp_version = log_funcs["get_mujoco_warp_version"] +get_newton_version = log_funcs["get_newton_version"] +log_app_start_time = log_funcs["log_app_start_time"] +log_python_imports_time = log_funcs["log_python_imports_time"] +log_task_start_time = log_funcs["log_task_start_time"] +log_scene_creation_time = log_funcs["log_scene_creation_time"] +log_simulation_start_time = log_funcs["log_simulation_start_time"] +log_total_start_time = log_funcs["log_total_start_time"] +log_runtime_step_times = log_funcs["log_runtime_step_times"] imports_time_begin = time.perf_counter_ns() import gymnasium as gym import numpy as np -import os import torch from datetime import datetime @@ -104,21 +132,40 @@ # Create the benchmark -benchmark = BaseIsaacBenchmark( - benchmark_name="benchmark_non_rl", - workflow_metadata={ - "metadata": [ - {"name": "task", "data": args_cli.task}, - {"name": "seed", "data": args_cli.seed}, - {"name": "num_envs", "data": args_cli.num_envs}, - {"name": "num_frames", "data": args_cli.num_frames}, - {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, - {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, - {"name": "Newton Info", "data": get_newton_version()}, - ] - }, - backend_type=args_cli.benchmark_backend, -) +if args_cli.kit: + benchmark = BaseIsaacBenchmark( + benchmark_name="benchmark_non_rl", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "num_frames", "data": args_cli.num_frames}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ] + }, + backend_type=args_cli.benchmark_backend, + ) +else: + benchmark = StandaloneBenchmark( + benchmark_name="benchmark_non_rl", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "num_frames", "data": args_cli.num_frames}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ] + }, + backend_type=args_cli.benchmark_backend, + output_folder=args_cli.output_folder, + randomize_filename_prefix=True, + ) @hydra_task_config(args_cli.task, None) @@ -127,15 +174,19 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # override configurations with non-hydra 
CLI arguments env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs - env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device + if args_cli.kit and hasattr(args_cli, "device") and args_cli.device is not None: + env_cfg.sim.device = args_cli.device # process distributed world_size = 1 world_rank = 0 if args_cli.distributed: - env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" - world_size = int(os.getenv("WORLD_SIZE", 1)) - world_rank = app_launcher.global_rank + if args_cli.kit: + env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" + world_size = int(os.getenv("WORLD_SIZE", 1)) + world_rank = app_launcher.global_rank + else: + print("[WARNING] Distributed mode is only supported with --kit flag.") task_startup_time_begin = time.perf_counter_ns() @@ -143,7 +194,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) # wrap for video recording if args_cli.video: - log_root_path = os.path.abs(f"benchmark/{args_cli.task}") + log_root_path = os.path.abspath(f"benchmark/{args_cli.task}") log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") video_kwargs = { "video_folder": os.path.join(log_root_path, log_dir, "videos"), @@ -165,7 +216,30 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): num_frames = 0 # log frame times step_times = [] - while simulation_app.is_running(): + + # Run loop depends on whether we're using kit or not + if args_cli.kit: + while simulation_app.is_running(): + while num_frames < args_cli.num_frames: + # get upper and lower bounds of action space, sample actions randomly on this interval + action_high = 1 + action_low = -1 + actions = (action_high - action_low) * torch.rand( + env.unwrapped.num_envs, env.unwrapped.single_action_space.shape[0], device=env.unwrapped.device + ) - action_high + + # env stepping + env_step_time_begin = time.perf_counter_ns() + _ = env.step(actions) + end_step_time_end = time.perf_counter_ns() + step_times.append(end_step_time_end - env_step_time_begin) + + num_frames += 1 + + # terminate + break + else: + # Standalone mode - simple loop while num_frames < args_cli.num_frames: # get upper and lower bounds of action space, sample actions randomly on this interval action_high = 1 @@ -182,9 +256,6 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): num_frames += 1 - # terminate - break - if world_rank == 0: benchmark.store_measurements() @@ -203,8 +274,13 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6) log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6) log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6) - log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000) - log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000) + + # Timer may not be available in standalone mode + scene_creation_time = get_timer_value("scene_creation") + simulation_start_time = get_timer_value("simulation_start") + + log_scene_creation_time(benchmark, scene_creation_time * 1000 if scene_creation_time else None) + log_simulation_start_time(benchmark, simulation_start_time * 1000 if simulation_start_time else None) log_total_start_time(benchmark, (task_startup_time_end - 
app_start_time_begin) / 1e6) log_runtime_step_times(benchmark, environment_step_times, compute_stats=True) @@ -218,4 +294,5 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # run the main function main() # close sim app - simulation_app.close() + if simulation_app is not None: + simulation_app.close() diff --git a/scripts/benchmarks/benchmark_rlgames.py b/scripts/benchmarks/benchmark_rlgames.py index e399503e014..e788cf90a39 100644 --- a/scripts/benchmarks/benchmark_rlgames.py +++ b/scripts/benchmarks/benchmark_rlgames.py @@ -8,13 +8,12 @@ """Launch Isaac Sim Simulator first.""" import argparse +import os import sys import time -from isaaclab.app import AppLauncher - # add argparse arguments -parser = argparse.ArgumentParser(description="Train an RL agent with RL-Games.") +parser = argparse.ArgumentParser(description="Benchmark RL agent with RL-Games.") parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") @@ -32,11 +31,23 @@ choices=["LocalLogMetrics", "JSONFileMetrics", "OsmoKPIFile", "OmniPerfKPIFile"], help="Benchmarking backend options, defaults OmniPerfKPIFile", ) +parser.add_argument("--output_folder", type=str, default=None, help="Output folder for the benchmark.") +parser.add_argument( + "--kit", + action="store_true", + default=False, + help="Enable Isaac Sim Kit and use isaacsim.benchmark.services. Default: False (uses standalone benchmark).", +) + +# Conditionally add AppLauncher args only if --kit is enabled +if "--kit" in sys.argv: + from isaaclab.app import AppLauncher + + AppLauncher.add_app_launcher_args(parser) -# append AppLauncher cli args -AppLauncher.add_app_launcher_args(parser) # parse the arguments args_cli, hydra_args = parser.parse_known_args() + # always enable cameras to record video if args_cli.video: args_cli.enable_cameras = True @@ -46,19 +57,56 @@ app_start_time_begin = time.perf_counter_ns() -# launch omniverse app -app_launcher = AppLauncher(args_cli) -simulation_app = app_launcher.app +# Conditionally launch Isaac Sim +simulation_app = None +app_launcher = None +if args_cli.kit: + # Force Omniverse mode by setting environment variable + # This ensures SimulationApp is launched even without explicit visualizers + os.environ["LAUNCH_OV_APP"] = "1" + + # launch omniverse app + app_launcher = AppLauncher(args_cli) + simulation_app = app_launcher.app app_start_time_end = time.perf_counter_ns() """Rest everything follows.""" -# enable benchmarking extension -from isaacsim.core.utils.extensions import enable_extension - -enable_extension("isaacsim.benchmark.services") -from isaacsim.benchmark.services import BaseIsaacBenchmark +# Import benchmark infrastructure based on kit flag +if args_cli.kit: + # enable benchmarking extension + from isaacsim.core.utils.extensions import enable_extension + + enable_extension("isaacsim.benchmark.services") + from isaacsim.benchmark.services import BaseIsaacBenchmark + + from scripts.benchmarks.utils.benchmark_utils import create_kit_logging_functions, get_timer_value + + # Get all logging functions for kit mode + log_funcs = create_kit_logging_functions() +else: + # Use standalone benchmark services + from scripts.benchmarks.utils.benchmark_utils import create_standalone_logging_functions, get_timer_value + from 
scripts.benchmarks.utils.standalone_benchmark import StandaloneBenchmark + + # Get all logging functions for standalone mode + log_funcs = create_standalone_logging_functions() + +# Extract individual functions from the dictionary for easier use +get_isaaclab_version = log_funcs["get_isaaclab_version"] +get_mujoco_warp_version = log_funcs["get_mujoco_warp_version"] +get_newton_version = log_funcs["get_newton_version"] +log_app_start_time = log_funcs["log_app_start_time"] +log_python_imports_time = log_funcs["log_python_imports_time"] +log_task_start_time = log_funcs["log_task_start_time"] +log_scene_creation_time = log_funcs["log_scene_creation_time"] +log_simulation_start_time = log_funcs["log_simulation_start_time"] +log_total_start_time = log_funcs["log_total_start_time"] +log_runtime_step_times = log_funcs["log_runtime_step_times"] +log_rl_policy_rewards = log_funcs["log_rl_policy_rewards"] +log_rl_policy_episode_lengths = log_funcs["log_rl_policy_episode_lengths"] +parse_tf_logs = log_funcs["parse_tf_logs"] imports_time_begin = time.perf_counter_ns() @@ -84,22 +132,6 @@ imports_time_end = time.perf_counter_ns() -sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) - -from isaaclab.utils.timer import Timer -from scripts.benchmarks.utils import ( - log_app_start_time, - log_python_imports_time, - log_rl_policy_episode_lengths, - log_rl_policy_rewards, - log_runtime_step_times, - log_scene_creation_time, - log_simulation_start_time, - log_task_start_time, - log_total_start_time, - parse_tf_logs, -) - torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True torch.backends.cudnn.deterministic = False @@ -107,18 +139,40 @@ # Create the benchmark -benchmark = BaseIsaacBenchmark( - benchmark_name="benchmark_rlgames_train", - workflow_metadata={ - "metadata": [ - {"name": "task", "data": args_cli.task}, - {"name": "seed", "data": args_cli.seed}, - {"name": "num_envs", "data": args_cli.num_envs}, - {"name": "max_iterations", "data": args_cli.max_iterations}, - ] - }, - backend_type=args_cli.benchmark_backend, -) +if args_cli.kit: + benchmark = BaseIsaacBenchmark( + benchmark_name="benchmark_rlgames_train", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "max_iterations", "data": args_cli.max_iterations}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ] + }, + backend_type=args_cli.benchmark_backend, + ) +else: + benchmark = StandaloneBenchmark( + benchmark_name="benchmark_rlgames_train", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "max_iterations", "data": args_cli.max_iterations}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ] + }, + backend_type=args_cli.benchmark_backend, + output_folder=args_cli.output_folder, + randomize_filename_prefix=True, + ) @hydra_task_config(args_cli.task, "rl_games_cfg_entry_point") @@ -127,7 +181,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # override configurations with non-hydra CLI arguments env_cfg.scene.num_envs = 
args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs - env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device + if args_cli.kit and hasattr(args_cli, "device") and args_cli.device is not None: + env_cfg.sim.device = args_cli.device # randomly sample a seed if seed = -1 if args_cli.seed == -1: @@ -137,9 +192,12 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # process distributed world_rank = 0 if args_cli.distributed: - env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" - agent_cfg["params"]["config"]["device"] = f"cuda:{app_launcher.local_rank}" - world_rank = app_launcher.global_rank + if args_cli.kit: + env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" + agent_cfg["params"]["config"]["device"] = f"cuda:{app_launcher.local_rank}" + world_rank = app_launcher.global_rank + else: + print("[WARNING] Distributed mode is only supported with --kit flag.") # specify directory for logging experiments log_root_path = os.path.join("logs", "rl_games", agent_cfg["params"]["config"]["name"]) @@ -153,7 +211,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): agent_cfg["params"]["config"]["full_experiment_name"] = log_dir # multi-gpu training config - if args_cli.distributed: + if args_cli.distributed and args_cli.kit: agent_cfg["params"]["seed"] += app_launcher.global_rank agent_cfg["params"]["config"]["device"] = f"cuda:{app_launcher.local_rank}" agent_cfg["params"]["config"]["device_name"] = f"cuda:{app_launcher.local_rank}" @@ -229,24 +287,29 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # prepare RL timing dict rl_training_times = { - "Environment only step time": log_data["performance/step_time"], - "Environment + Inference step time": log_data["performance/step_inference_time"], - "Environment + Inference + Policy update time": log_data["performance/rl_update_time"], - "Environment only FPS": log_data["performance/step_fps"], - "Environment + Inference FPS": log_data["performance/step_inference_fps"], - "Environment + Inference + Policy update FPS": log_data["performance/step_inference_rl_update_fps"], + "Environment only step time": log_data.get("performance/step_time", []), + "Environment + Inference step time": log_data.get("performance/step_inference_time", []), + "Environment + Inference + Policy update time": log_data.get("performance/rl_update_time", []), + "Environment only FPS": log_data.get("performance/step_fps", []), + "Environment + Inference FPS": log_data.get("performance/step_inference_fps", []), + "Environment + Inference + Policy update FPS": log_data.get("performance/step_inference_rl_update_fps", []), } # log additional metrics to benchmark services log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6) log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6) log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6) - log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000) - log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000) + + # Timer may not be available in standalone mode + scene_creation_time = get_timer_value("scene_creation") + simulation_start_time = get_timer_value("simulation_start") + + log_scene_creation_time(benchmark, scene_creation_time * 1000 if scene_creation_time else None) + log_simulation_start_time(benchmark, simulation_start_time * 1000 if 
simulation_start_time else None) log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6) log_runtime_step_times(benchmark, rl_training_times, compute_stats=True) - log_rl_policy_rewards(benchmark, log_data["rewards/iter"]) - log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"]) + log_rl_policy_rewards(benchmark, log_data.get("rewards/iter", [])) + log_rl_policy_episode_lengths(benchmark, log_data.get("episode_lengths/iter", [])) benchmark.stop() @@ -258,4 +321,5 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: dict): # run the main function main() # close sim app - simulation_app.close() + if simulation_app is not None: + simulation_app.close() diff --git a/scripts/benchmarks/benchmark_rsl_rl.py b/scripts/benchmarks/benchmark_rsl_rl.py index 8d853e28853..f70cd2ef15a 100644 --- a/scripts/benchmarks/benchmark_rsl_rl.py +++ b/scripts/benchmarks/benchmark_rsl_rl.py @@ -3,10 +3,6 @@ # # SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2022-2025, The IsaacLab Project Developers. -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause """Script to benchmark RL agent with RSL-RL.""" @@ -17,13 +13,10 @@ import sys import time -from isaaclab.app import AppLauncher - sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) -import scripts.reinforcement_learning.rsl_rl.cli_args as cli_args # isort: skip # add argparse arguments -parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.") +parser = argparse.ArgumentParser(description="Benchmark RL agent with RSL-RL.") parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") @@ -47,11 +40,22 @@ default="/tmp", help="Output folder for the benchmark metrics.", ) +parser.add_argument( + "--kit", + action="store_true", + default=False, + help="Enable Isaac Sim Kit and use isaacsim.benchmark.services. 
Default: False (uses standalone benchmark).", +) + +# Conditionally add RSL-RL and AppLauncher args only if --kit is enabled +if "--kit" in sys.argv: + import scripts.reinforcement_learning.rsl_rl.cli_args as cli_args + + cli_args.add_rsl_rl_args(parser) -# append RSL-RL cli arguments -cli_args.add_rsl_rl_args(parser) -# append AppLauncher cli args -AppLauncher.add_app_launcher_args(parser) + from isaaclab.app import AppLauncher + + AppLauncher.add_app_launcher_args(parser) # to ensure kit args don't break the benchmark arg parsing args_cli, hydra_args = parser.parse_known_args() @@ -65,9 +69,17 @@ app_start_time_begin = time.perf_counter_ns() -# launch omniverse app -app_launcher = AppLauncher(args_cli) -simulation_app = app_launcher.app +# Conditionally launch Isaac Sim +simulation_app = None +app_launcher = None +if args_cli.kit: + # Force Omniverse mode by setting environment variable + # This ensures SimulationApp is launched even without explicit visualizers + os.environ["LAUNCH_OV_APP"] = "1" + + # launch omniverse app + app_launcher = AppLauncher(args_cli) + simulation_app = app_launcher.app app_start_time_end = time.perf_counter_ns() @@ -81,10 +93,6 @@ from rsl_rl.runners import OnPolicyRunner -from isaaclab.utils.timer import Timer - -Timer.enable_display_output = False - from isaaclab.envs import DirectRLEnvCfg, ManagerBasedRLEnvCfg from isaaclab.utils.dict import print_dict from isaaclab.utils.io import dump_pickle, dump_yaml @@ -97,37 +105,49 @@ imports_time_end = time.perf_counter_ns() -from isaacsim.core.utils.extensions import enable_extension - -enable_extension("isaacsim.benchmark.services") - -# Set the benchmark settings according to the inputs -import carb - -settings = carb.settings.get_settings() -settings.set("/exts/isaacsim.benchmark.services/metrics/metrics_output_folder", args_cli.output_folder) -settings.set("/exts/isaacsim.benchmark.services/metrics/randomize_filename_prefix", True) - - -from isaacsim.benchmark.services import BaseIsaacBenchmark - -from scripts.benchmarks.utils import ( - get_isaaclab_version, - get_mujoco_warp_version, - get_newton_version, - log_app_start_time, - log_newton_finalize_builder_time, - log_newton_initialize_solver_time, - log_python_imports_time, - log_rl_policy_episode_lengths, - log_rl_policy_rewards, - log_runtime_step_times, - log_scene_creation_time, - log_simulation_start_time, - log_task_start_time, - log_total_start_time, - parse_tf_logs, -) +# Import benchmark infrastructure based on kit flag +if args_cli.kit: + from isaacsim.core.utils.extensions import enable_extension + + enable_extension("isaacsim.benchmark.services") + + # Set the benchmark settings according to the inputs + import carb + + settings = carb.settings.get_settings() + settings.set("/exts/isaacsim.benchmark.services/metrics/metrics_output_folder", args_cli.output_folder) + settings.set("/exts/isaacsim.benchmark.services/metrics/randomize_filename_prefix", True) + + from isaacsim.benchmark.services import BaseIsaacBenchmark + + from scripts.benchmarks.utils.benchmark_utils import create_kit_logging_functions, get_timer_value + + # Get all logging functions for kit mode + log_funcs = create_kit_logging_functions() +else: + # Use standalone benchmark services + from scripts.benchmarks.utils.benchmark_utils import create_standalone_logging_functions, get_timer_value + from scripts.benchmarks.utils.standalone_benchmark import StandaloneBenchmark + + # Get all logging functions for standalone mode + log_funcs = create_standalone_logging_functions() + +# 
Extract individual functions from the dictionary for easier use +get_isaaclab_version = log_funcs["get_isaaclab_version"] +get_mujoco_warp_version = log_funcs["get_mujoco_warp_version"] +get_newton_version = log_funcs["get_newton_version"] +log_app_start_time = log_funcs["log_app_start_time"] +log_python_imports_time = log_funcs["log_python_imports_time"] +log_task_start_time = log_funcs["log_task_start_time"] +log_scene_creation_time = log_funcs["log_scene_creation_time"] +log_simulation_start_time = log_funcs["log_simulation_start_time"] +log_newton_finalize_builder_time = log_funcs["log_newton_finalize_builder_time"] +log_newton_initialize_solver_time = log_funcs["log_newton_initialize_solver_time"] +log_total_start_time = log_funcs["log_total_start_time"] +log_runtime_step_times = log_funcs["log_runtime_step_times"] +log_rl_policy_rewards = log_funcs["log_rl_policy_rewards"] +log_rl_policy_episode_lengths = log_funcs["log_rl_policy_episode_lengths"] +parse_tf_logs = log_funcs["parse_tf_logs"] torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True @@ -135,30 +155,57 @@ torch.backends.cudnn.benchmark = False # Create the benchmark -benchmark = BaseIsaacBenchmark( - benchmark_name="benchmark_rsl_rl_train", - workflow_metadata={ - "metadata": [ - {"name": "task", "data": args_cli.task}, - {"name": "seed", "data": args_cli.seed}, - {"name": "num_envs", "data": args_cli.num_envs}, - {"name": "max_iterations", "data": args_cli.max_iterations}, - {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, - {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, - {"name": "Newton Info", "data": get_newton_version()}, - ], - }, - backend_type=args_cli.benchmark_backend, -) +if args_cli.kit: + benchmark = BaseIsaacBenchmark( + benchmark_name="benchmark_rsl_rl_train", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "max_iterations", "data": args_cli.max_iterations}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ], + }, + backend_type=args_cli.benchmark_backend, + ) +else: + benchmark = StandaloneBenchmark( + benchmark_name="benchmark_rsl_rl_train", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "max_iterations", "data": args_cli.max_iterations}, + {"name": "Mujoco Warp Info", "data": get_mujoco_warp_version()}, + {"name": "Isaac Lab Info", "data": get_isaaclab_version()}, + {"name": "Newton Info", "data": get_newton_version()}, + ], + }, + backend_type=args_cli.benchmark_backend, + output_folder=args_cli.output_folder, + randomize_filename_prefix=True, + ) @hydra_task_config(args_cli.task, "rsl_rl_cfg_entry_point") def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg): """Train with RSL-RL agent.""" # parse configuration - benchmark.set_phase("loading", start_recording_frametime=False, start_recording_runtime=True) + if args_cli.kit: + benchmark.set_phase("loading", start_recording_frametime=False, start_recording_runtime=True) + else: + benchmark.set_phase("loading", start_recording_frametime=False, start_recording_runtime=False) + # override configurations with non-hydra CLI arguments - agent_cfg = 
cli_args.update_rsl_rl_cfg(agent_cfg, args_cli) + if args_cli.kit: + import scripts.reinforcement_learning.rsl_rl.cli_args as cli_args + + agent_cfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli) + env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs agent_cfg.max_iterations = ( args_cli.max_iterations if args_cli.max_iterations is not None else agent_cfg.max_iterations @@ -167,21 +214,25 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: RslRlOnPolic # set the environment seed # note: certain randomizations occur in the environment initialization so we set the seed here env_cfg.seed = agent_cfg.seed - env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device + if args_cli.kit and hasattr(args_cli, "device") and args_cli.device is not None: + env_cfg.sim.device = args_cli.device # multi-gpu training configuration world_rank = 0 world_size = 1 if args_cli.distributed: - env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" - agent_cfg.device = f"cuda:{app_launcher.local_rank}" - - # set seed to have diversity in different threads - seed = agent_cfg.seed + app_launcher.local_rank - env_cfg.seed = seed - agent_cfg.seed = seed - world_rank = app_launcher.global_rank - world_size = int(os.getenv("WORLD_SIZE", 1)) + if args_cli.kit: + env_cfg.sim.device = f"cuda:{app_launcher.local_rank}" + agent_cfg.device = f"cuda:{app_launcher.local_rank}" + + # set seed to have diversity in different threads + seed = agent_cfg.seed + app_launcher.local_rank + env_cfg.seed = seed + agent_cfg.seed = seed + world_rank = app_launcher.global_rank + world_size = int(os.getenv("WORLD_SIZE", 1)) + else: + print("[WARNING] Distributed mode is only supported with --kit flag.") # specify directory for logging experiments log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name) @@ -252,30 +303,41 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: RslRlOnPolic # prepare RL timing dict collection_fps = ( 1 - / (np.array(log_data["Perf/collection time"])) + / (np.array(log_data.get("Perf/collection time", [1]))) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env * world_size ) rl_training_times = { - "Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(), - "Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(), + "Collection Time": (np.array(log_data.get("Perf/collection time", [])) / 1000).tolist(), + "Learning Time": (np.array(log_data.get("Perf/learning_time", [])) / 1000).tolist(), "Collection FPS": collection_fps.tolist(), - "Total FPS": log_data["Perf/total_fps"] * world_size, + "Total FPS": [x * world_size for x in log_data.get("Perf/total_fps", [])], } # log additional metrics to benchmark services log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6) log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6) log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6) - log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000) - log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000) - log_newton_finalize_builder_time(benchmark, Timer.get_timer_info("newton_finalize_builder") * 1000) - log_newton_initialize_solver_time(benchmark, Timer.get_timer_info("newton_initialize_solver") * 1000) + + # Timer may not be available in standalone mode + scene_creation_time = 
get_timer_value("scene_creation") + simulation_start_time = get_timer_value("simulation_start") + newton_finalize_builder_time = get_timer_value("newton_finalize_builder") + newton_initialize_solver_time = get_timer_value("newton_initialize_solver") + + log_scene_creation_time(benchmark, scene_creation_time * 1000 if scene_creation_time else None) + log_simulation_start_time(benchmark, simulation_start_time * 1000 if simulation_start_time else None) + log_newton_finalize_builder_time( + benchmark, newton_finalize_builder_time * 1000 if newton_finalize_builder_time else None + ) + log_newton_initialize_solver_time( + benchmark, newton_initialize_solver_time * 1000 if newton_initialize_solver_time else None + ) log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6) log_runtime_step_times(benchmark, rl_training_times, compute_stats=True) - log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"]) - log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"]) + log_rl_policy_rewards(benchmark, log_data.get("Train/mean_reward", [])) + log_rl_policy_episode_lengths(benchmark, log_data.get("Train/mean_episode_length", [])) benchmark.stop() @@ -287,4 +349,5 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg, agent_cfg: RslRlOnPolic # run the main function main() # close sim app - simulation_app.close() + if simulation_app is not None: + simulation_app.close() diff --git a/scripts/benchmarks/utils/README.md b/scripts/benchmarks/utils/README.md new file mode 100644 index 00000000000..7ed521a6799 --- /dev/null +++ b/scripts/benchmarks/utils/README.md @@ -0,0 +1,117 @@ +# Benchmark Utilities + +This directory contains utility modules used by the benchmark scripts. + +## Files + +### `benchmark_utils.py` +Common utility functions for both kit and standalone modes. Provides factory functions that return the appropriate logging implementations based on the mode: +- `create_kit_logging_functions()`: Returns logging functions for Isaac Sim kit mode +- `create_standalone_logging_functions()`: Returns logging functions for standalone mode +- `get_timer_value()`: Safely retrieves timer values + +### `standalone_benchmark.py` +Standalone benchmark infrastructure that replicates the functionality of `isaacsim.benchmark.services` without requiring Isaac Sim. Contains: +- Measurement classes (`SingleMeasurement`, `ListMeasurement`, `DictMeasurement`, `BooleanMeasurement`) +- Metadata classes (`StringMetadata`, `IntMetadata`, `FloatMetadata`, `DictMetadata`) +- `TestPhase` class for organizing measurements +- Backend implementations: + - `OmniPerfKPIFile`: Single JSON with all phases + - `OsmoKPIFile`: Separate JSON per phase + - `JSONFileMetrics`: Detailed JSON with full objects + - `LocalLogMetrics`: Console output only +- `StandaloneBenchmark`: Main benchmark class +- **System Metrics Collection** (requires `psutil` and optionally `GPUtil`/`pynvml`): + - CPU metrics: user, system, idle, iowait percentages + - Memory metrics: RSS, VMS, USS (in GB) + - GPU metrics: memory usage, utilization + - Runtime duration per phase + - System information: CPU count, GPU device name + +### `utils.py` +Legacy utility functions for Isaac Sim kit mode benchmarking. Contains: +- Version retrieval functions (`get_isaaclab_version`, `get_newton_version`, `get_mujoco_warp_version`) +- Logging functions for various metrics (timing, rewards, episode lengths, etc.) 
+- `parse_tf_logs()`: TensorBoard log parser + +## Usage + +The benchmark scripts (`benchmark_non_rl.py`, `benchmark_rlgames.py`, `benchmark_rsl_rl.py`) automatically select the appropriate utilities based on the `--kit` flag: + +```python +if args_cli.kit: + # Use Isaac Sim benchmark services + log_funcs = create_kit_logging_functions() +else: + # Use standalone benchmark services + log_funcs = create_standalone_logging_functions() +``` + +This approach eliminates code duplication across the benchmark scripts while maintaining support for both modes. + +## System Metrics Collection + +The standalone benchmark can automatically collect system metrics similar to Isaac Sim's benchmark services: + +### Collected Metrics + +**System Information (collected once):** +- Number of CPUs +- GPU device name + +**Runtime Metrics (collected per phase):** +- **Memory:** + - RSS (Resident Set Size) + - VMS (Virtual Memory Size) + - USS (Unique Set Size) +- **CPU Usage:** + - User time percentage + - System time percentage + - Idle time percentage + - I/O wait time percentage +- **GPU (if available):** + - Memory used + - Total memory + - GPU utilization percentage +- **Runtime:** Phase execution duration + +### Requirements + +- **psutil**: Required for CPU and memory metrics (automatically detected) +- **GPUtil** or **pynvml**: Optional for GPU metrics (automatically detected) +- **nvidia-smi**: Fallback for GPU metrics (usually pre-installed with NVIDIA drivers) + +The GPU detection tries three methods in order: +1. **GPUtil**: Fast Python library +2. **pynvml** (nvidia-ml-py3): Official NVIDIA library +3. **nvidia-smi**: Direct system call (most reliable, works even with driver/library version mismatch) + +Install with: +```bash +pip install psutil GPUtil +# or +pip install psutil nvidia-ml-py3 +``` + +Note: `nvidia-smi` is automatically used as a fallback and doesn't require installation (comes with NVIDIA drivers). + +### Usage + +System metrics are collected automatically by default: + +```python +benchmark = StandaloneBenchmark( + benchmark_name="my_benchmark", + collect_system_metrics=True # default +) +``` + +To disable system metrics: +```python +benchmark = StandaloneBenchmark( + benchmark_name="my_benchmark", + collect_system_metrics=False +) +``` + +The metrics will appear in your output JSON alongside your custom measurements. diff --git a/scripts/benchmarks/utils/__init__.py b/scripts/benchmarks/utils/__init__.py new file mode 100644 index 00000000000..822b13ae1a1 --- /dev/null +++ b/scripts/benchmarks/utils/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Utilities for benchmarking scripts.""" diff --git a/scripts/benchmarks/utils/benchmark_utils.py b/scripts/benchmarks/utils/benchmark_utils.py new file mode 100644 index 00000000000..07a5b4dcfaa --- /dev/null +++ b/scripts/benchmarks/utils/benchmark_utils.py @@ -0,0 +1,205 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Common utility functions for benchmark scripts running in standalone or kit mode.""" + +import os + +from isaaclab.utils.timer import Timer + + +def create_kit_logging_functions(): + """Create logging functions for kit mode that use isaacsim.benchmark.services. 
+ + Returns: + A dictionary containing all the logging function implementations for kit mode. + """ + from scripts.benchmarks.utils.utils import ( + get_isaaclab_version, + get_mujoco_warp_version, + get_newton_version, + log_app_start_time, + log_newton_finalize_builder_time, + log_newton_initialize_solver_time, + log_python_imports_time, + log_rl_policy_episode_lengths, + log_rl_policy_rewards, + log_runtime_step_times, + log_scene_creation_time, + log_simulation_start_time, + log_task_start_time, + log_total_start_time, + parse_tf_logs, + ) + + return { + "get_isaaclab_version": get_isaaclab_version, + "get_mujoco_warp_version": get_mujoco_warp_version, + "get_newton_version": get_newton_version, + "log_app_start_time": log_app_start_time, + "log_python_imports_time": log_python_imports_time, + "log_task_start_time": log_task_start_time, + "log_scene_creation_time": log_scene_creation_time, + "log_simulation_start_time": log_simulation_start_time, + "log_newton_finalize_builder_time": log_newton_finalize_builder_time, + "log_newton_initialize_solver_time": log_newton_initialize_solver_time, + "log_total_start_time": log_total_start_time, + "log_runtime_step_times": log_runtime_step_times, + "log_rl_policy_rewards": log_rl_policy_rewards, + "log_rl_policy_episode_lengths": log_rl_policy_episode_lengths, + "parse_tf_logs": parse_tf_logs, + } + + +def create_standalone_logging_functions(): # noqa: C901 + """Create logging functions for standalone mode that use standalone_benchmark. + + Returns: + A dictionary containing all the logging function implementations for standalone mode. + """ + from scripts.benchmarks.utils.standalone_benchmark import DictMeasurement, ListMeasurement, SingleMeasurement + + def get_isaaclab_version(): + try: + import isaaclab + + return {"version": isaaclab.__version__, "commit": None, "branch": None} + except Exception: + return {"version": None, "commit": None, "branch": None} + + def get_mujoco_warp_version(): + try: + import mujoco_warp + + return {"version": getattr(mujoco_warp, "__version__", None), "commit": None, "branch": None} + except Exception: + return {"version": None, "commit": None, "branch": None} + + def get_newton_version(): + try: + import newton + + return {"version": newton.__version__, "commit": None, "branch": None} + except Exception: + return {"version": None, "commit": None, "branch": None} + + def log_app_start_time(benchmark, value): + measurement = SingleMeasurement(name="App Launch Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_python_imports_time(benchmark, value): + measurement = SingleMeasurement(name="Python Imports Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_task_start_time(benchmark, value): + measurement = SingleMeasurement(name="Task Creation and Start Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_scene_creation_time(benchmark, value): + if value is not None: + measurement = SingleMeasurement(name="Scene Creation Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_simulation_start_time(benchmark, value): + if value is not None: + measurement = SingleMeasurement(name="Simulation Start Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_newton_finalize_builder_time(benchmark, value): + if value is not None: + measurement = SingleMeasurement(name="Newton 
Finalize Builder Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_newton_initialize_solver_time(benchmark, value): + if value is not None: + measurement = SingleMeasurement(name="Newton Initialize Solver Time", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_total_start_time(benchmark, value): + measurement = SingleMeasurement(name="Total Start Time (Launch to Train)", value=value, unit="ms") + benchmark.store_custom_measurement("startup", measurement) + + def log_runtime_step_times(benchmark, value, compute_stats=True): + measurement = DictMeasurement(name="Step Frametimes", value=value) + benchmark.store_custom_measurement("runtime", measurement) + if compute_stats: + for k, v in value.items(): + if isinstance(v, list) and len(v) > 0: + measurement = SingleMeasurement(name=f"Min {k}", value=min(v), unit="ms") + benchmark.store_custom_measurement("runtime", measurement) + measurement = SingleMeasurement(name=f"Max {k}", value=max(v), unit="ms") + benchmark.store_custom_measurement("runtime", measurement) + measurement = SingleMeasurement(name=f"Mean {k}", value=sum(v) / len(v), unit="ms") + benchmark.store_custom_measurement("runtime", measurement) + + def log_rl_policy_rewards(benchmark, value): + measurement = ListMeasurement(name="Rewards", value=value) + benchmark.store_custom_measurement("train", measurement) + if len(value) > 0: + measurement = SingleMeasurement(name="Max Rewards", value=max(value), unit="float") + benchmark.store_custom_measurement("train", measurement) + measurement = SingleMeasurement(name="Last Reward", value=value[-1], unit="float") + benchmark.store_custom_measurement("train", measurement) + + def log_rl_policy_episode_lengths(benchmark, value): + measurement = ListMeasurement(name="Episode Lengths", value=value) + benchmark.store_custom_measurement("train", measurement) + if len(value) > 0: + measurement = SingleMeasurement(name="Max Episode Lengths", value=max(value), unit="float") + benchmark.store_custom_measurement("train", measurement) + measurement = SingleMeasurement(name="Last Episode Length", value=value[-1], unit="float") + benchmark.store_custom_measurement("train", measurement) + + def parse_tf_logs(log_dir: str): + """Parse tensorboard logs.""" + import glob + + from tensorboard.backend.event_processing import event_accumulator + + list_of_files = glob.glob(f"{log_dir}/events*") + if not list_of_files: + return {} + latest_file = max(list_of_files, key=os.path.getctime) + + log_data = {} + ea = event_accumulator.EventAccumulator(latest_file) + ea.Reload() + tags = ea.Tags()["scalars"] + for tag in tags: + log_data[tag] = [] + for event in ea.Scalars(tag): + log_data[tag].append(event.value) + return log_data + + return { + "get_isaaclab_version": get_isaaclab_version, + "get_mujoco_warp_version": get_mujoco_warp_version, + "get_newton_version": get_newton_version, + "log_app_start_time": log_app_start_time, + "log_python_imports_time": log_python_imports_time, + "log_task_start_time": log_task_start_time, + "log_scene_creation_time": log_scene_creation_time, + "log_simulation_start_time": log_simulation_start_time, + "log_newton_finalize_builder_time": log_newton_finalize_builder_time, + "log_newton_initialize_solver_time": log_newton_initialize_solver_time, + "log_total_start_time": log_total_start_time, + "log_runtime_step_times": log_runtime_step_times, + "log_rl_policy_rewards": log_rl_policy_rewards, + "log_rl_policy_episode_lengths": 
log_rl_policy_episode_lengths, + "parse_tf_logs": parse_tf_logs, + } + + +def get_timer_value(timer_name: str) -> float: + """Safely get timer value, returning 0 if not available. + + Args: + timer_name: Name of the timer to retrieve. + + Returns: + Timer value in seconds, or 0 if not available. + """ + value = Timer.get_timer_info(timer_name) + return value if value is not None else 0 diff --git a/scripts/benchmarks/utils/standalone_benchmark.py b/scripts/benchmarks/utils/standalone_benchmark.py new file mode 100644 index 00000000000..28ac8909d56 --- /dev/null +++ b/scripts/benchmarks/utils/standalone_benchmark.py @@ -0,0 +1,861 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Standalone benchmark services that can run without Isaac Sim. + +This module provides a lightweight benchmarking system that mimics the functionality +of isaacsim.benchmark.services but does not depend on Isaac Sim being available. +""" + +import json +import multiprocessing +import os +import subprocess +import tempfile +import time +from contextlib import suppress +from dataclasses import dataclass, field +from datetime import datetime as dt +from pathlib import Path + +# Optional dependencies for system metrics +try: + import psutil + + PSUTIL_AVAILABLE = True +except ImportError: + PSUTIL_AVAILABLE = False + print("[WARNING] psutil not available. System metrics will not be collected. Install with: pip install psutil") + +# Try GPU libraries +GPUTIL_AVAILABLE = False +PYNVML_AVAILABLE = False + +try: + import GPUtil + + GPUTIL_AVAILABLE = True + print("[INFO] GPUtil available for GPU metrics") +except ImportError: + pass + +if not GPUTIL_AVAILABLE: + try: + import pynvml + + PYNVML_AVAILABLE = True + print("[INFO] pynvml available for GPU metrics") + except ImportError: + pass + +if not GPUTIL_AVAILABLE and not PYNVML_AVAILABLE: + print("[INFO] GPUtil/pynvml not available, will use nvidia-smi if available") + + +############################# +# Measurement Data Classes # +############################# + + +@dataclass +class Measurement: + """Base measurement class.""" + + name: str + + +@dataclass +class SingleMeasurement(Measurement): + """Represents a single float measurement.""" + + value: float + unit: str + type: str = "single" + + +@dataclass +class BooleanMeasurement(Measurement): + """Represents a boolean measurement.""" + + bvalue: bool + type: str = "boolean" + + +@dataclass +class DictMeasurement(Measurement): + """Represents a dictionary measurement.""" + + value: dict + type: str = "dict" + + +@dataclass +class ListMeasurement(Measurement): + """Represents a list measurement.""" + + value: list + type: str = "list" + + def __repr__(self): + return f"{self.__class__.__name__}(name={self.name!r}, length={len(self.value)})" + + +@dataclass +class MetadataBase: + """Base metadata class.""" + + name: str + + +@dataclass +class StringMetadata(MetadataBase): + """String metadata.""" + + data: str + type: str = "string" + + +@dataclass +class IntMetadata(MetadataBase): + """Integer metadata.""" + + data: int + type: str = "int" + + +@dataclass +class FloatMetadata(MetadataBase): + """Float metadata.""" + + data: float + type: str = "float" + + +@dataclass +class DictMetadata(MetadataBase): + """Dictionary metadata.""" + + data: dict + type: str = "dict" + + +@dataclass +class TestPhase: + """Represents a single test phase which may have many metrics 
associated with it.""" + + phase_name: str + measurements: list[Measurement] = field(default_factory=list) + metadata: list[MetadataBase] = field(default_factory=list) + + def get_metadata_field(self, name, default=KeyError): + """Get a metadata field's value. + + Args: + name: Field name (case-insensitive). + default: Default value if not found. If KeyError, raises exception. + + Returns: + The metadata value. + + Raises: + KeyError: If the field is not found and default is KeyError. + """ + name = name.lower() + for m in self.metadata: + name2 = m.name.replace(self.phase_name, "").strip().lower() + if name == name2: + return m.data + + if default is KeyError: + raise KeyError(name) + return default + + @classmethod + def metadata_from_dict(cls, m: dict) -> list[MetadataBase]: + """Create metadata from dictionary. + + Args: + m: Dictionary containing metadata list. + + Returns: + List of MetadataBase objects. + """ + metadata = [] + metadata_mapping = {str: StringMetadata, int: IntMetadata, float: FloatMetadata, dict: DictMetadata} + for meas in m["metadata"]: + if "data" in meas: + metadata_type = metadata_mapping.get(type(meas["data"])) + if metadata_type: + curr_meta = metadata_type(name=meas["name"], data=meas["data"]) + metadata.append(curr_meta) + return metadata + + +class TestPhaseEncoder(json.JSONEncoder): + """JSON encoder for TestPhase objects.""" + + def default(self, o): + return o.__dict__ + + +############################# +# Backend Implementation # +############################# + + +class OmniPerfKPIFile: + """Prints metrics into a JSON document compatible with OmniPerfKPIFile format.""" + + def __init__(self): + self._test_phases = [] + + def add_metrics(self, test_phase: TestPhase) -> None: + """Add a test phase to the backend. + + Args: + test_phase: The test phase to add. + """ + self._test_phases.append(test_phase) + + def finalize(self, metrics_output_folder: str, randomize_filename_prefix: bool = False) -> None: + """Write metrics to output file. + + Args: + metrics_output_folder: Output folder for metrics file. + randomize_filename_prefix: Whether to randomize the filename prefix. + """ + if not self._test_phases: + print("[WARNING] No test phases to write. 
Skipping metrics file generation.") + return + + workflow_data = {"timestamp": dt.now().isoformat()} + + test_name = None + for test_phase in self._test_phases: + # Retrieve useful metadata from test_phase + test_name = test_phase.get_metadata_field("workflow_name") + phase_name = test_phase.get_metadata_field("phase") + + phase_data = {} + log_statements = [f"{phase_name} Metrics:"] + # Add metadata as metrics + for metadata in test_phase.metadata: + phase_data[metadata.name] = metadata.data + log_statements.append(f" {metadata.name}: {metadata.data}") + # Add measurements as metrics + for measurement in test_phase.measurements: + if isinstance(measurement, SingleMeasurement): + log_statements.append(f" {measurement.name}: {measurement.value} {measurement.unit}") + phase_data[measurement.name] = measurement.value + elif isinstance(measurement, (DictMeasurement, ListMeasurement)): + # For dict and list measurements, store them as-is + phase_data[measurement.name] = measurement.value + # Log all metrics to console + print("\n".join(log_statements)) + + workflow_data[phase_name] = phase_data + + # Generate the output filename + if randomize_filename_prefix: + _, metrics_filename_out = tempfile.mkstemp( + dir=metrics_output_folder, prefix=f"kpis_{test_name}", suffix=".json" + ) + else: + metrics_filename_out = Path(metrics_output_folder) / f"kpis_{test_name}.json" + # Dump key-value pairs to the JSON document + json_data = json.dumps(workflow_data, indent=4) + with open(metrics_filename_out, "w") as f: + print(f"[INFO] Writing metrics to {metrics_filename_out}") + f.write(json_data) + + +class OsmoKPIFile: + """Print metrics into separate JSON documents for each phase.""" + + def __init__(self): + self._test_phases = [] + + def add_metrics(self, test_phase: TestPhase) -> None: + """Add a test phase to the backend. + + Args: + test_phase: The test phase to add. + """ + self._test_phases.append(test_phase) + + def finalize(self, metrics_output_folder: str, randomize_filename_prefix: bool = False) -> None: + """Write metrics to output files. + + Args: + metrics_output_folder: Output folder for metrics files. + randomize_filename_prefix: Whether to randomize the filename prefix. 
+ """ + for test_phase in self._test_phases: + # Retrieve useful metadata from test_phase + test_name = test_phase.get_metadata_field("workflow_name") + phase_name = test_phase.get_metadata_field("phase") + + osmo_kpis = {} + log_statements = [f"{phase_name} KPIs:"] + # Add metadata as KPIs + for metadata in test_phase.metadata: + osmo_kpis[metadata.name] = metadata.data + log_statements.append(f" {metadata.name}: {metadata.data}") + # Add single measurements as KPIs + for measurement in test_phase.measurements: + if isinstance(measurement, SingleMeasurement): + osmo_kpis[measurement.name] = measurement.value + log_statements.append(f" {measurement.name}: {measurement.value} {measurement.unit}") + # Log all KPIs to console + print("\n".join(log_statements)) + # Generate the output filename + if randomize_filename_prefix: + _, metrics_filename_out = tempfile.mkstemp( + dir=metrics_output_folder, prefix=f"kpis_{test_name}_{phase_name}", suffix=".json" + ) + else: + metrics_filename_out = Path(metrics_output_folder) / f"kpis_{test_name}_{phase_name}.json" + # Dump key-value pairs to the JSON document + json_data = json.dumps(osmo_kpis, indent=4) + with open(metrics_filename_out, "w") as f: + f.write(json_data) + + +class JSONFileMetrics: + """Dump all metrics to a single JSON file.""" + + def __init__(self): + self.data = [] + self.test_name = None + + def add_metrics(self, test_phase: TestPhase) -> None: + """Add a test phase to the backend. + + Args: + test_phase: The test phase to add. + """ + self.data.append(test_phase) + + def finalize(self, metrics_output_folder: str, randomize_filename_prefix: bool = False) -> None: + """Write metrics to output file. + + Args: + metrics_output_folder: Output folder for metrics file. + randomize_filename_prefix: Whether to randomize the filename prefix. + """ + if not self.data: + print("[WARNING] No test data to write. Skipping metrics file generation.") + return + + # Get test name + for test_phase in self.data: + test_name = test_phase.get_metadata_field("workflow_name") + if test_name != self.test_name: + if self.test_name: + print( + f"[WARNING] Nonempty test name {self.test_name} different from name {test_name} provided by" + " test phase." + ) + self.test_name = test_name + + phase_name = test_phase.get_metadata_field("phase") + for m in test_phase.measurements: + m.name = f"{test_name} {phase_name} {m.name}" + + for m in test_phase.metadata: + m.name = f"{test_name} {phase_name} {m.name}" + + json_data = json.dumps(self.data, indent=4, cls=TestPhaseEncoder) + + # Generate the output filename + if randomize_filename_prefix: + _, metrics_filename_out = tempfile.mkstemp( + dir=metrics_output_folder, prefix=f"metrics_{self.test_name}", suffix=".json" + ) + else: + metrics_filename_out = Path(metrics_output_folder) / f"metrics_{self.test_name}.json" + + with open(metrics_filename_out, "w") as f: + print(f"[INFO] Writing metrics to {metrics_filename_out}") + f.write(json_data) + + self.data.clear() + + +class LocalLogMetrics: + """Simple backend that just logs metrics to console.""" + + def __init__(self): + self._test_phases = [] + + def add_metrics(self, test_phase: TestPhase) -> None: + """Add a test phase to the backend. + + Args: + test_phase: The test phase to add. + """ + self._test_phases.append(test_phase) + + def finalize(self, metrics_output_folder: str, randomize_filename_prefix: bool = False) -> None: + """Log metrics to console. + + Args: + metrics_output_folder: Not used for this backend. 
+ randomize_filename_prefix: Not used for this backend. + """ + for test_phase in self._test_phases: + test_name = test_phase.get_metadata_field("workflow_name") + phase_name = test_phase.get_metadata_field("phase") + + print(f"\n{'=' * 60}") + print(f"Benchmark: {test_name} - Phase: {phase_name}") + print(f"{'=' * 60}") + + print("\nMetadata:") + for metadata in test_phase.metadata: + print(f" {metadata.name}: {metadata.data}") + + print("\nMeasurements:") + for measurement in test_phase.measurements: + if isinstance(measurement, SingleMeasurement): + print(f" {measurement.name}: {measurement.value} {measurement.unit}") + elif isinstance(measurement, (DictMeasurement, ListMeasurement)): + print( + f" {measurement.name}: {type(measurement.value).__name__} with {len(measurement.value)} items" + ) + + +class MetricsBackend: + """Factory for creating metrics backends.""" + + @staticmethod + def get_instance(instance_type: str): + """Get a metrics backend instance. + + Args: + instance_type: Type of backend ("OmniPerfKPIFile", "OsmoKPIFile", "JSONFileMetrics", "LocalLogMetrics"). + + Returns: + An instance of the requested backend. + + Raises: + ValueError: If instance_type is not recognized. + """ + if instance_type == "OmniPerfKPIFile": + return OmniPerfKPIFile() + elif instance_type == "OsmoKPIFile": + return OsmoKPIFile() + elif instance_type == "JSONFileMetrics": + return JSONFileMetrics() + elif instance_type == "LocalLogMetrics": + return LocalLogMetrics() + else: + raise ValueError(f"Unknown backend type: {instance_type}") + + +############################# +# Benchmark Class # +############################# + + +class StandaloneBenchmark: + """Standalone benchmark class that works without Isaac Sim. + + This class mimics the functionality of BaseIsaacBenchmark but does not + depend on Isaac Sim or its benchmark services extension. + """ + + def __init__( + self, + benchmark_name: str = "StandaloneBenchmark", + backend_type: str = "OmniPerfKPIFile", + workflow_metadata: dict = {}, + output_folder: str | None = None, + randomize_filename_prefix: bool = False, + collect_system_metrics: bool = True, + ): + """Initialize the standalone benchmark. + + Args: + benchmark_name: Name of the benchmark. + backend_type: Type of backend to use for metrics collection. + workflow_metadata: Metadata describing the benchmark. + output_folder: Output folder for metrics files. If None, uses temp directory. + randomize_filename_prefix: Whether to randomize the filename prefix. + collect_system_metrics: Whether to collect system metrics (CPU, memory, GPU). 
+        """
+        self.benchmark_name = benchmark_name
+        self._test_phases = []
+        self._current_phase = None
+        self._collect_system_metrics = collect_system_metrics and PSUTIL_AVAILABLE
+
+        # System metrics tracking
+        self._phase_start_time = None
+        self._process = psutil.Process() if PSUTIL_AVAILABLE else None
+
+        # Check if nvidia-smi is available
+        self._nvidia_smi_available = self._check_nvidia_smi()
+
+        # Get metrics backend
+        self._metrics = MetricsBackend.get_instance(instance_type=backend_type)
+
+        # Set output folder
+        if output_folder is None:
+            self._metrics_output_folder = tempfile.gettempdir()
+        else:
+            self._metrics_output_folder = output_folder
+
+        self._randomize_filename_prefix = randomize_filename_prefix
+
+        # Generate workflow-level metadata
+        self._metadata = [StringMetadata(name="workflow_name", data=self.benchmark_name)]
+        if "metadata" in workflow_metadata:
+            self._metadata.extend(TestPhase.metadata_from_dict(workflow_metadata))
+        elif workflow_metadata:
+            print("[WARNING] workflow_metadata was provided but has no 'metadata' entry; it will be ignored.")
+
+        print(f"[INFO] Benchmark initialized: {self.benchmark_name}")
+        print(f"[INFO] Output folder: {self._metrics_output_folder}")
+        print(f"[INFO] Backend type: {backend_type}")
+        print(f"[INFO] System metrics collection: {'enabled' if self._collect_system_metrics else 'disabled'}")
+        if self._nvidia_smi_available:
+            print("[INFO] nvidia-smi available for direct GPU queries")
+        self.benchmark_start_time = time.time()
+
+    def _check_nvidia_smi(self) -> bool:
+        """Check if nvidia-smi is available.
+
+        Returns:
+            True if nvidia-smi is available and working.
+        """
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], capture_output=True, text=True, timeout=2
+            )
+            return result.returncode == 0
+        except Exception:
+            # Covers FileNotFoundError, subprocess.TimeoutExpired and any other failure mode.
+            return False
+
+    def _get_gpu_info_nvidia_smi(self) -> dict[str, str | float] | None:
+        """Get GPU information using nvidia-smi directly.
+
+        Returns:
+            Dictionary with GPU info or None if failed.
+        """
+        try:
+            # Query multiple fields at once
+            result = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=name,memory.used,memory.total,utilization.gpu",
+                    "--format=csv,noheader,nounits",
+                ],
+                capture_output=True,
+                text=True,
+                timeout=2,
+            )
+
+            if result.returncode == 0:
+                line = result.stdout.strip().split("\n")[0]  # Get first GPU
+                parts = [p.strip() for p in line.split(",")]
+                if len(parts) >= 4:
+                    return {
+                        "name": parts[0],
+                        "memory_used_mb": float(parts[1]),
+                        "memory_total_mb": float(parts[2]),
+                        "utilization": float(parts[3]),
+                    }
+        except Exception as e:
+            print(f"[WARNING] nvidia-smi query failed: {e}")
+
+        return None
+
+    def _collect_system_info(self) -> list[MetadataBase]:
+        """Collect system information as metadata.
+
+        Returns:
+            List of metadata objects with system information.
+ """ + metadata = [] + + if not self._collect_system_metrics: + return metadata + + # CPU count + metadata.append(IntMetadata(name="num_cpus", data=multiprocessing.cpu_count())) + + # GPU information - try multiple methods + gpu_detected = False + + # Method 1: Try GPUtil + if GPUTIL_AVAILABLE and not gpu_detected: + try: + gpus = GPUtil.getGPUs() + if gpus and len(gpus) > 0: + metadata.append(StringMetadata(name="gpu_device_name", data=gpus[0].name)) + gpu_detected = True + print(f"[INFO] GPU detected via GPUtil: {gpus[0].name}") + except Exception as e: + print(f"[WARNING] Failed to get GPU info via GPUtil: {e}") + + # Method 2: Try pynvml + if PYNVML_AVAILABLE and not gpu_detected: + try: + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + if device_count > 0: + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + gpu_name = pynvml.nvmlDeviceGetName(handle) + if isinstance(gpu_name, bytes): + gpu_name = gpu_name.decode("utf-8") + metadata.append(StringMetadata(name="gpu_device_name", data=gpu_name)) + gpu_detected = True + print(f"[INFO] GPU detected via pynvml: {gpu_name}") + pynvml.nvmlShutdown() + except Exception as e: + print(f"[WARNING] Failed to get GPU info via pynvml: {e}") + with suppress(Exception): + pynvml.nvmlShutdown() + + # Method 3: Try nvidia-smi directly (most reliable, bypasses driver mismatch) + if self._nvidia_smi_available and not gpu_detected: + try: + gpu_info = self._get_gpu_info_nvidia_smi() + if gpu_info: + metadata.append(StringMetadata(name="gpu_device_name", data=gpu_info["name"])) + gpu_detected = True + print(f"[INFO] GPU detected via nvidia-smi: {gpu_info['name']}") + except Exception as e: + print(f"[WARNING] Failed to get GPU info via nvidia-smi: {e}") + + if not gpu_detected: + print("[WARNING] No GPU detected. GPU metrics will not be available.") + print( + "[INFO] This is likely due to NVIDIA driver issues (Driver/library version mismatch or error code 18)." + ) + print( + "[INFO] To fix: 1) sudo reboot (reloads drivers) 2) Check nvidia-smi manually 3) Reinstall NVIDIA" + " drivers" + ) + print("[INFO] The benchmark will continue without GPU metrics.") + + return metadata + + def _collect_runtime_metrics(self) -> list[Measurement]: + """Collect runtime system metrics. + + Returns: + List of measurement objects with runtime metrics. 
+ """ + measurements = [] + + if not self._collect_system_metrics: + return measurements + + try: + # Memory metrics (in GB) + mem_info = self._process.memory_info() + measurements.append(SingleMeasurement(name="System Memory RSS", value=mem_info.rss / (1024**3), unit="GB")) + measurements.append(SingleMeasurement(name="System Memory VMS", value=mem_info.vms / (1024**3), unit="GB")) + + # USS (Unique Set Size) if available + try: + mem_full = self._process.memory_full_info() + measurements.append( + SingleMeasurement(name="System Memory USS", value=mem_full.uss / (1024**3), unit="GB") + ) + except (AttributeError, psutil.AccessDenied): + pass + + # CPU usage + cpu_times = psutil.cpu_times_percent(interval=0.1) + measurements.append(SingleMeasurement(name="System CPU user", value=cpu_times.user, unit="%")) + measurements.append(SingleMeasurement(name="System CPU system", value=cpu_times.system, unit="%")) + measurements.append(SingleMeasurement(name="System CPU idle", value=cpu_times.idle, unit="%")) + if hasattr(cpu_times, "iowait"): + measurements.append(SingleMeasurement(name="System CPU iowait", value=cpu_times.iowait, unit="%")) + + # Runtime duration + if self._phase_start_time is not None: + runtime_ms = (time.time() - self._phase_start_time) * 1000 + measurements.append(SingleMeasurement(name="Runtime", value=runtime_ms, unit="ms")) + + # GPU metrics - try multiple methods + gpu_metrics_collected = False + + # Method 1: Try GPUtil + if GPUTIL_AVAILABLE and not gpu_metrics_collected: + try: + gpus = GPUtil.getGPUs() + if gpus and len(gpus) > 0: + gpu = gpus[0] + measurements.append( + SingleMeasurement(name="GPU Memory Tracked", value=gpu.memoryUsed / 1024, unit="GB") + ) + measurements.append( + SingleMeasurement(name="GPU Memory Dedicated", value=gpu.memoryTotal / 1024, unit="GB") + ) + measurements.append(SingleMeasurement(name="GPU Utilization", value=gpu.load * 100, unit="%")) + gpu_metrics_collected = True + except Exception as e: + print(f"[WARNING] Failed to collect GPU metrics via GPUtil: {e}") + + # Method 2: Try pynvml + if PYNVML_AVAILABLE and not gpu_metrics_collected: + try: + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + if device_count > 0: + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + util_rates = pynvml.nvmlDeviceGetUtilizationRates(handle) + measurements.append( + SingleMeasurement(name="GPU Memory Tracked", value=mem_info.used / (1024**3), unit="GB") + ) + measurements.append( + SingleMeasurement(name="GPU Memory Dedicated", value=mem_info.total / (1024**3), unit="GB") + ) + measurements.append(SingleMeasurement(name="GPU Utilization", value=util_rates.gpu, unit="%")) + gpu_metrics_collected = True + pynvml.nvmlShutdown() + except Exception as e: + print(f"[WARNING] Failed to collect GPU metrics via pynvml: {e}") + with suppress(Exception): + pynvml.nvmlShutdown() + + # Method 3: Try nvidia-smi directly (most reliable, bypasses driver mismatch) + if self._nvidia_smi_available and not gpu_metrics_collected: + try: + gpu_info = self._get_gpu_info_nvidia_smi() + if gpu_info: + measurements.append( + SingleMeasurement( + name="GPU Memory Tracked", value=gpu_info["memory_used_mb"] / 1024, unit="GB" + ) + ) + measurements.append( + SingleMeasurement( + name="GPU Memory Dedicated", value=gpu_info["memory_total_mb"] / 1024, unit="GB" + ) + ) + measurements.append( + SingleMeasurement(name="GPU Utilization", value=gpu_info["utilization"], unit="%") + ) + gpu_metrics_collected = True + print("[INFO] 
Collected GPU metrics via nvidia-smi") + except Exception as e: + print(f"[WARNING] Failed to collect GPU metrics via nvidia-smi: {e}") + + if not gpu_metrics_collected: + if GPUTIL_AVAILABLE or PYNVML_AVAILABLE or self._nvidia_smi_available: + print("[WARNING] GPU libraries/tools available but all methods failed to collect metrics.") + print("[INFO] This usually indicates NVIDIA driver issues (Driver/library mismatch or GPU lost).") + print("[INFO] To fix: sudo reboot (reloads drivers and libraries)") + else: + print("[INFO] No GPU libraries available. GPU metrics will not be collected.") + + except Exception as e: + print(f"[WARNING] Failed to collect runtime metrics: {e}") + import traceback + + traceback.print_exc() + + return measurements + + def set_phase( + self, phase: str, start_recording_frametime: bool = True, start_recording_runtime: bool = True + ) -> None: + """Set the current benchmarking phase. + + Args: + phase: Name of the phase. + start_recording_frametime: Not used in standalone version (for API compatibility). + start_recording_runtime: Not used in standalone version (for API compatibility). + """ + print(f"[INFO] Starting phase: {phase}") + self._current_phase = phase + self._phase_start_time = time.time() + + def store_measurements(self) -> None: + """Store measurements for the current phase. + + This method should be called after completing work in a phase and before + setting a new phase or calling stop(). + """ + if self._current_phase is None: + print("[WARNING] No phase set. Call set_phase() before store_measurements().") + return + + # Create a new test phase + test_phase = TestPhase(phase_name=self._current_phase, measurements=[], metadata=[]) + + # Collect system info metadata (only for first phase or if explicitly needed) + if len(self._test_phases) == 0: + system_metadata = self._collect_system_info() + test_phase.metadata.extend(system_metadata) + + # Collect runtime metrics + runtime_measurements = self._collect_runtime_metrics() + test_phase.measurements.extend(runtime_measurements) + + # Update test phase metadata with phase name and benchmark metadata + test_phase.metadata.extend(self._metadata) + test_phase.metadata.append(StringMetadata(name="phase", data=self._current_phase)) + self._test_phases.append(test_phase) + + print(f"[INFO] Stored measurements for phase: {self._current_phase}") + + def store_custom_measurement(self, phase_name: str, custom_measurement: Measurement) -> None: + """Store a custom measurement for a specific phase. + + Args: + phase_name: Name of the phase. + custom_measurement: The measurement to store. 
+ """ + # Check if the phase already exists + existing_phase = next((phase for phase in self._test_phases if phase.phase_name == phase_name), None) + + if existing_phase: + # Add the custom measurement to the existing phase + existing_phase.measurements.append(custom_measurement) + else: + # If the phase does not exist, create a new test phase + new_test_phase = TestPhase(phase_name=phase_name, measurements=[custom_measurement], metadata=[]) + # Update test phase metadata with phase name and benchmark metadata + new_test_phase.metadata.extend(self._metadata) + new_test_phase.metadata.append(StringMetadata(name="phase", data=phase_name)) + + # Add the new test phase to the list of test phases + self._test_phases.append(new_test_phase) + + def stop(self): + """Stop benchmarking and write accumulated metrics to file.""" + print("[INFO] Stopping benchmark") + + if not self._test_phases: + print( + "[WARNING] No test phases collected. After set_phase(), store_measurements() should be called. " + "No metrics will be written." + ) + return + + # Create output folder if it doesn't exist + if not os.path.exists(self._metrics_output_folder): + os.makedirs(self._metrics_output_folder, exist_ok=True) + + print("[INFO] Writing metrics data.") + + # Finalize by adding all test phases to the backend metrics + for test_phase in self._test_phases: + self._metrics.add_metrics(test_phase) + + self._metrics.finalize(self._metrics_output_folder, self._randomize_filename_prefix) + + elapsed_time = time.time() - self.benchmark_start_time + print(f"[INFO] Benchmark completed in {elapsed_time:.2f} seconds") diff --git a/scripts/benchmarks/utils/test_gpu_detection.py b/scripts/benchmarks/utils/test_gpu_detection.py new file mode 100755 index 00000000000..97555dceaaf --- /dev/null +++ b/scripts/benchmarks/utils/test_gpu_detection.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Test GPU detection for standalone benchmark.""" + +from contextlib import suppress + +print("Testing GPU detection methods...\n") + +# Test GPUtil +print("=" * 60) +print("Testing GPUtil:") +print("=" * 60) +try: + import GPUtil + + print("✓ GPUtil is installed") + gpus = GPUtil.getGPUs() + if gpus: + print(f"✓ Found {len(gpus)} GPU(s)") + for i, gpu in enumerate(gpus): + print(f" GPU {i}: {gpu.name}") + print(f" Memory Used: {gpu.memoryUsed:.2f} MB") + print(f" Memory Total: {gpu.memoryTotal:.2f} MB") + print(f" Load: {gpu.load * 100:.1f}%") + else: + print("✗ No GPUs found via GPUtil") +except ImportError: + print("✗ GPUtil not installed. 
Install with: pip install GPUtil") +except Exception as e: + print(f"✗ Error using GPUtil: {e}") + +print() + +# Test pynvml +print("=" * 60) +print("Testing pynvml (nvidia-ml-py3):") +print("=" * 60) +try: + import pynvml + + print("✓ pynvml is installed") + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + print(f"✓ Found {device_count} GPU(s)") + + for i in range(device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + name = pynvml.nvmlDeviceGetName(handle) + if isinstance(name, bytes): + name = name.decode("utf-8") + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + util_rates = pynvml.nvmlDeviceGetUtilizationRates(handle) + + print(f" GPU {i}: {name}") + print(f" Memory Used: {mem_info.used / (1024**2):.2f} MB") + print(f" Memory Total: {mem_info.total / (1024**2):.2f} MB") + print(f" GPU Utilization: {util_rates.gpu}%") + print(f" Memory Utilization: {util_rates.memory}%") + + pynvml.nvmlShutdown() +except ImportError: + print("✗ pynvml not installed. Install with: pip install nvidia-ml-py3") +except Exception as e: + print(f"✗ Error using pynvml: {e}") + with suppress(Exception): + pynvml.nvmlShutdown() + +print() + +# Test nvidia-smi directly +print("=" * 60) +print("Testing nvidia-smi (direct system call):") +print("=" * 60) +try: + import subprocess + + result = subprocess.run( + ["nvidia-smi", "--query-gpu=name,memory.used,memory.total,utilization.gpu", "--format=csv,noheader,nounits"], + capture_output=True, + text=True, + timeout=2, + ) + if result.returncode == 0: + print("✓ nvidia-smi is available") + lines = result.stdout.strip().split("\n") + for i, line in enumerate(lines): + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 4: + print(f" GPU {i}: {parts[0]}") + print(f" Memory Used: {float(parts[1]):.2f} MB") + print(f" Memory Total: {float(parts[2]):.2f} MB") + print(f" GPU Utilization: {float(parts[3]):.1f}%") + else: + print(f"✗ nvidia-smi returned error code: {result.returncode}") + print(f" stderr: {result.stderr}") +except FileNotFoundError: + print("✗ nvidia-smi not found. NVIDIA drivers may not be installed.") +except subprocess.TimeoutExpired: + print("✗ nvidia-smi timed out") +except Exception as e: + print(f"✗ Error running nvidia-smi: {e}") + +print() + +# Test torch CUDA +print("=" * 60) +print("Testing PyTorch CUDA:") +print("=" * 60) +try: + import torch + + print("✓ PyTorch is installed") + if torch.cuda.is_available(): + print("✓ CUDA is available") + print(f" Device count: {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + print(f" GPU {i}: {torch.cuda.get_device_name(i)}") + mem_allocated = torch.cuda.memory_allocated(i) / (1024**3) + mem_reserved = torch.cuda.memory_reserved(i) / (1024**3) + print(f" Memory Allocated: {mem_allocated:.2f} GB") + print(f" Memory Reserved: {mem_reserved:.2f} GB") + else: + print("✗ CUDA not available in PyTorch") +except ImportError: + print("✗ PyTorch not installed") +except Exception as e: + print(f"✗ Error using PyTorch: {e}") + +print() +print("=" * 60) +print("Summary:") +print("=" * 60) +print("GPU Detection Methods (in priority order):") +print("1. GPUtil - Python library, fast but affected by driver mismatch") +print("2. pynvml - Official NVIDIA library, affected by driver mismatch") +print("3. PyTorch CUDA - Shows PyTorch-allocated memory (may be 0)") +print("4. nvidia-smi - Direct system call, MOST RELIABLE, bypasses driver issues") +print() +print("If no GPUs were detected, possible reasons:") +print("1. No NVIDIA GPU in the system") +print("2. 
NVIDIA drivers not installed (check: nvidia-smi)") +print("3. Driver/library version mismatch (ERROR: 'Driver/library version mismatch')") +print(" → FIX: sudo reboot (reloads driver and libraries)") +print("4. GPU libraries not installed") +print("5. Permission issues (try: sudo usermod -a -G video $USER)") +print() +print("Best Solution for 'Driver/library version mismatch':") +print(" → Use nvidia-smi method (Method 4) - works despite mismatch") +print(" → Or reboot system to sync driver/library versions") +print() +print("Recommended install command:") +print(" pip install psutil GPUtil nvidia-ml-py3") +print() +print("Note: PyTorch is usually already installed in Isaac Lab environments") +print(" and nvidia-smi is the most reliable fallback.") diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils/utils.py similarity index 100% rename from scripts/benchmarks/utils.py rename to scripts/benchmarks/utils/utils.py
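For reviewers, here is a minimal usage sketch of the standalone path added in scripts/benchmarks/utils/standalone_benchmark.py. Only the StandaloneBenchmark API itself comes from this PR; the import path, backend choice, output folder, and phase names below are illustrative assumptions.

# Usage sketch only: import path, backend choice, output folder and phase names are assumptions.
from scripts.benchmarks.utils.standalone_benchmark import StandaloneBenchmark

benchmark = StandaloneBenchmark(
    benchmark_name="benchmark_non_rl",
    backend_type="JSONFileMetrics",
    output_folder="/tmp/benchmarks",
    randomize_filename_prefix=False,
)

benchmark.set_phase("startup")
# ... create the scene / environment here ...
benchmark.store_measurements()

benchmark.set_phase("runtime")
# ... step the environment for the benchmarked frames here ...
benchmark.store_measurements()

# stop() creates the output folder if needed; with JSONFileMetrics the result is
# metrics_benchmark_non_rl.json (the workflow_name metadata is derived from benchmark_name).
benchmark.stop()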
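The four metrics backends share an implicit two-method protocol (add_metrics followed by finalize), which is also how StandaloneBenchmark.stop() drives them. The sketch below exercises a backend directly; it assumes that TestPhase, StringMetadata, SingleMeasurement and MetricsBackend are importable from the same standalone utils module, and the measurement name and value are made up for illustration.

# Sketch under the assumption that these classes are exposed by the standalone utils module.
import os

from scripts.benchmarks.utils.standalone_benchmark import (
    MetricsBackend,
    SingleMeasurement,
    StringMetadata,
    TestPhase,
)

backend = MetricsBackend.get_instance("OsmoKPIFile")

# Build a phase the same way StandaloneBenchmark.store_measurements() does.
phase = TestPhase(phase_name="runtime", measurements=[], metadata=[])
phase.metadata.append(StringMetadata(name="workflow_name", data="benchmark_non_rl"))
phase.metadata.append(StringMetadata(name="phase", data="runtime"))
phase.measurements.append(SingleMeasurement(name="Mean FPS", value=100000.0, unit="frames/s"))

backend.add_metrics(phase)

# OsmoKPIFile writes one kpis_benchmark_non_rl_runtime.json per phase into the given folder.
output_folder = "/tmp/benchmarks"
os.makedirs(output_folder, exist_ok=True)
backend.finalize(output_folder, randomize_filename_prefix=False)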