From 0f3045bb2c4899ea4233b68a0a666b8ae8e18c74 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 9 Dec 2025 09:39:10 -0600 Subject: [PATCH 1/2] Move docstrings from PYI file to implementation Added tests that docstrings exist and are not empty. This closes #291 --- python/cuda/bench/__init__.py | 4 + python/cuda/bench/__init__.pyi | 319 ++-------- python/src/py_nvbench.cpp | 1069 ++++++++++++++++++++++---------- python/test/test_nvbench.py | 51 ++ 4 files changed, 855 insertions(+), 588 deletions(-) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index e1d2282a..8444b501 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -57,3 +57,7 @@ ) del load_nvidia_dynamic_lib + +__doc__ = """ +CUDA Kernel Benchmarking Library Python API +""" diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi index 86681fc4..76ebcdc4 100644 --- a/python/cuda/bench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -29,223 +29,74 @@ from collections.abc import Callable, Sequence from typing import Optional, Self, SupportsFloat, SupportsInt, Union class CudaStream: - """Represents CUDA stream - - Note - ---- - The class is not user-constructible. - """ - def __cuda_stream__(self) -> tuple[int, int]: - """ - Special method implement CUDA stream protocol - from `cuda.core`. Returns a pair of integers: - (protocol_version, integral_value_of_cudaStream_t pointer) - - Example - ------- - import cuda.core.experimental as core - import cuda.bench as bench - - def bench(state: bench.State): - dev = core.Device(state.get_device()) - dev.set_current() - # converts CudaString to core.Stream - # using __cuda_stream__ protocol - dev.create_stream(state.get_stream()) - """ - ... - - def addressof(self) -> int: - "Integral value of address of driver's CUDA stream struct" - ... + def __cuda_stream__(self) -> tuple[int, int]: ... + def addressof(self) -> int: ... 
class Benchmark: - """Represents NVBench benchmark. - - Note - ---- - The class is not user-constructible. - - Use `~register` function to create Benchmark and register - it with NVBench. - """ - def get_name(self) -> str: - "Get benchmark name" - ... - def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self: - "Add integral type parameter axis with given name and values to sweep over" - ... + def get_name(self) -> str: ... + def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self: ... def add_int64_power_of_two_axis( self, name: str, values: Sequence[SupportsInt] - ) -> Self: - "Add integral type parameter axis with given name and values to sweep over" - ... - def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: - "Add floating-point type parameter axis with given name and values to sweep over" - ... - def add_string_axis(self, name: str, values: Sequence[str]) -> Self: - "Add string type parameter axis with given name and values to sweep over" - ... - def set_name(self, name: str) -> Self: - "Set benchmark name" - ... - def set_is_cpu_only(self, is_cpu_only: bool) -> Self: - "Set whether this benchmark only executes on CPU" - ... - def set_run_once(self, v: bool) -> Self: - "Set whether all benchmark configurations are executed only once" - ... - def set_skip_time(self, duration_seconds: SupportsFloat) -> Self: - "Set run durations, in seconds, that should be skipped" - ... - def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self: - "Set throttle recovery delay, in seconds" - ... - def set_throttle_threshold(self, threshold: SupportsFloat) -> Self: - "Set throttle threshold, as a fraction of maximal GPU frequency" - ... - def set_timeout(self, duration_seconds: SupportsFloat) -> Self: - "Set benchmark run duration timeout value, in seconds" - ... - def set_stopping_criterion(self, criterion: str) -> Self: - "Set stopping criterion to be used" - ... 
- def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self: - "Set stopping criterion floating point parameter value" - ... - def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self: - "Set stopping criterion integer parameter value" - ... - def set_criterion_param_string(self, name: str, value: str) -> Self: - "Set stopping criterion string parameter value" - ... - def set_min_samples(self, count: SupportsInt) -> Self: - "Set minimal samples count before stopping criterion applies" - ... + ) -> Self: ... + def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: ... + def add_string_axis(self, name: str, values: Sequence[str]) -> Self: ... + def set_name(self, name: str) -> Self: ... + def set_run_once(self, v: bool) -> Self: ... + def set_skip_time(self, duration_seconds: SupportsFloat) -> Self: ... + def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self: ... + def set_throttle_threshold(self, threshold: SupportsFloat) -> Self: ... + def set_timeout(self, duration_seconds: SupportsFloat) -> Self: ... + def set_stopping_criterion(self, criterion: str) -> Self: ... + def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self: ... + def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self: ... + def set_criterion_param_string(self, name: str, value: str) -> Self: ... + def set_min_samples(self, count: SupportsInt) -> Self: ... class Launch: - """Configuration object for function launch. - - Note - ---- - The class is not user-constructible. - """ - def get_stream(self) -> CudaStream: - "Get CUDA stream of this configuration" - ... + def get_stream(self) -> CudaStream: ... class State: - """Represent benchmark configuration state. - - Note - ---- - The class is not user-constructible. - """ - def has_device(self) -> bool: - "True if configuration has a device" - ... - def has_printers(self) -> bool: - "True if configuration has a printer" - ... 
- def get_device(self) -> Union[int, None]: - "Get device_id of the device from this configuration" - ... - def get_stream(self) -> CudaStream: - "CudaStream object from this configuration" - ... - def get_int64(self, name: str) -> int: - "Get value for given Int64 axis from this configuration" - ... - def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int: - "Get value for given Int64 axis from this configuration" - ... - def get_float64(self, name: str) -> float: - "Get value for given Float64 axis from this configuration" - ... - def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float: - "Get value for given Float64 axis from this configuration" - ... - def get_string(self, name: str) -> str: - "Get value for given String axis from this configuration" - ... - def get_string_or_default(self, name: str, default_value: str) -> str: - "Get value for given String axis from this configuration" - ... + def has_device(self) -> bool: ... + def has_printers(self) -> bool: ... + def get_device(self) -> Union[int, None]: ... + def get_stream(self) -> CudaStream: ... + def get_int64(self, name: str) -> int: ... + def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int: ... + def get_float64(self, name: str) -> float: ... + def get_float64_or_default( + self, name: str, default_value: SupportsFloat + ) -> float: ... + def get_string(self, name: str) -> str: ... + def get_string_or_default(self, name: str, default_value: str) -> str: ... def add_element_count( self, count: SupportsInt, column_name: Optional[str] = None - ) -> None: - "Add element count" - ... - def set_element_count(self, count: SupportsInt) -> None: - "Set element count" - ... - def get_element_count(self) -> int: - "Get element count" - ... - def skip(self, reason: str) -> None: - "Skip this configuration" - ... - def is_skipped(self) -> bool: - "Has this configuration been skipped" - ... 
- def get_skip_reason(self) -> str: - "Get reason provided for skipping this configuration" - ... + ) -> None: ... + def set_element_count(self, count: SupportsInt) -> None: ... + def get_element_count(self) -> int: ... + def skip(self, reason: str) -> None: ... + def is_skipped(self) -> bool: ... + def get_skip_reason(self) -> str: ... def add_global_memory_reads( self, nbytes: SupportsInt, /, column_name: str = "" - ) -> None: - "Inform NVBench that given amount of bytes is being read by the benchmark from global memory" - ... + ) -> None: ... def add_global_memory_writes( self, nbytes: SupportsInt, /, column_name: str = "" - ) -> None: - "Inform NVBench that given amount of bytes is being written by the benchmark into global memory" - ... - def get_benchmark(self) -> Benchmark: - "Get Benchmark this configuration is a part of" - ... - def get_throttle_threshold(self) -> float: - "Get throttle threshold value, as fraction of maximal frequency" - ... - def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None: - "Set throttle threshold fraction to specified value, expected to be between 0 and 1" - ... - def get_min_samples(self) -> int: - "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" - ... - def set_min_samples(self, min_samples_count: SupportsInt) -> None: - "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used" - ... - def get_disable_blocking_kernel(self) -> bool: - "True if use of blocking kernel by NVBench is disabled, False otherwise" - ... - def set_disable_blocking_kernel(self, flag: bool) -> None: - "Use flag = True to disable use of blocking kernel by NVBench" - ... - def get_run_once(self) -> bool: - "Boolean flag whether configuration should only run once" - ... - def set_run_once(self, run_once_flag: bool) -> None: - "Set run-once flag for this configuration" - ... 
- def get_timeout(self) -> float: - "Get time-out value for benchmark execution of this configuration, in seconds" - ... - def set_timeout(self, duration: SupportsFloat) -> None: - "Set time-out value for benchmark execution of this configuration, in seconds" - ... - def get_blocking_kernel_timeout(self) -> float: - "Get time-out value for execution of blocking kernel, in seconds" - ... - def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: - "Set time-out value for execution of blocking kernel, in seconds" - ... - def collect_cupti_metrics(self) -> None: - "Request NVBench to record CUPTI metrics while running benchmark for this configuration" - ... - def is_cupti_required(self) -> bool: - "True if (some) CUPTI metrics are being collected" - ... + ) -> None: ... + def get_benchmark(self) -> Benchmark: ... + def get_throttle_threshold(self) -> float: ... + def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None: ... + def get_min_samples(self) -> int: ... + def set_min_samples(self, min_samples_count: SupportsInt) -> None: ... + def get_disable_blocking_kernel(self) -> bool: ... + def set_disable_blocking_kernel(self, flag: bool) -> None: ... + def get_run_once(self) -> bool: ... + def set_run_once(self, run_once_flag: bool) -> None: ... + def get_timeout(self) -> float: ... + def set_timeout(self, duration: SupportsFloat) -> None: ... + def get_blocking_kernel_timeout(self) -> float: ... + def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: ... + def collect_cupti_metrics(self) -> None: ... + def is_cupti_required(self) -> bool: ... def exec( self, fn: Callable[[Launch], None], @@ -253,60 +104,16 @@ class State: *, batched: Optional[bool] = True, sync: Optional[bool] = False, - ): - """Execute callable running the benchmark. - - The callable may be executed multiple times. - - Parameters - ---------- - fn: Callable - Python callable with signature fn(Launch) -> None that executes the benchmark. 
- batched: bool, optional - If `True`, no cache flushing is performed between callable invocations. - Default: `True`. - sync: bool, optional - True value indicates that callable performs device synchronization. - NVBench disables use of blocking kernel in this case. - Default: `False`. - """ - ... - def get_short_description(self) -> str: - "Get short description for this configuration" - ... + ): ... + def get_short_description(self) -> str: ... def add_summary( self, column_name: str, value: Union[SupportsInt, SupportsFloat, str] - ) -> None: - "Add summary column with a value" - ... - def get_axis_values(self) -> dict[str, int | float | str]: - "Get dictionary with axis values for this configuration" - ... - def get_axis_values_as_string(self) -> str: - "Get string of space-separated name=value pairs for this configuration" - ... - def get_stopping_criterion(self) -> str: - "Get string name of stopping criterion used" - ... - -def register(fn: Callable[[State], None]) -> Benchmark: - """ - Register given benchmarking function with NVBench. - """ - ... - -def run_all_benchmarks(argv: Sequence[str]) -> None: - """ - Run all benchmarks registered with NVBench. - - Parameters - ---------- - argv: List[str] - Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`. - """ - ... + ) -> None: ... + def get_axis_values(self) -> dict[str, int | float | str]: ... + def get_axis_values_as_string(self) -> str: ... + def get_stopping_criterion(self) -> str: ... -class NVBenchRuntimeError(RuntimeError): - """An exception raised if running benchmarks encounters an error""" +def register(fn: Callable[[State], None]) -> Benchmark: ... +def run_all_benchmarks(argv: Sequence[str]) -> None: ... - ... +class NVBenchRuntimeError(RuntimeError): ... 
diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 2b09574d..e01a33ce 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -// namespace -//{ +namespace +{ struct PyObjectDeleter { @@ -61,8 +61,6 @@ struct PyObjectDeleter } }; -namespace -{ struct benchmark_wrapper_t { @@ -114,10 +112,10 @@ struct benchmark_wrapper_t private: // Important to use shared pointer here rather than py::object directly, - // since copy constructor must be const (benchmark::do_clone is const member method) + // since copy constructor must be const (consequence of benchmark::do_clone + // being a const member method) std::shared_ptr m_fn; }; -} // namespace // Use struct to ensure public inheritance struct nvbench_run_error : std::runtime_error @@ -264,58 +262,85 @@ py::dict py_get_axis_values(const nvbench::state &state) // essentially a global variable, but allocated on the heap during module initialization std::unique_ptr global_registry{}; -//} // end of anonymous namespace - -// ========================================== -// PLEASE KEEP IN SYNC WITH __init__.pyi FILE -// ========================================== -// If you modify these bindings, please be sure to update the -// corresponding type hints in ``../cuda/nvbench/__init__.pyi`` - -PYBIND11_MODULE(_nvbench, m) +// Definitions of Python API +static void def_class_CudaStream(py::module_ m) { - // == STEP 1 - // Set environment variable CUDA_MODULE_LOADING=EAGER - - NVBENCH_DRIVER_API_CALL(cuInit(0)); - - // This line ensures that benchmark_manager has been created during module init - // It is reinitialized before running all benchmarks to set devices to use - nvbench::benchmark_manager::get().initialize(); - - // == STEP 2 // Define CudaStream class // ATTN: nvbench::cuda_stream is move-only class // Methods: // Constructors, based on device, or on existing stream // nvbench::cuda_stream::get_stream - auto py_cuda_stream_cls = py::class_(m,
"CudaStream"); - - py_cuda_stream_cls.def("__cuda_stream__", - [](const nvbench::cuda_stream &s) -> std::pair { - return std::make_pair(std::size_t{0}, - reinterpret_cast(s.get_stream())); - }); - - py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t { + static constexpr const char *class_CudaStream_doc = R"XXX( +Represents CUDA stream + + Note + ---- + The class is not user-constructible. +)XXX"; + + auto py_cuda_stream_cls = py::class_(m, "CudaStream", class_CudaStream_doc); + + auto method__cuda_stream__impl = + [](const nvbench::cuda_stream &s) -> std::pair { + return std::make_pair(std::size_t{0}, reinterpret_cast(s.get_stream())); + }; + static constexpr const char *method__cuda_stream__doc = R"XXX( + Special method implementing CUDA stream protocol + from `cuda.core`. Returns a pair of integers: + (protocol_version, integral_value_of_cudaStream_t pointer) + + Example + ------- + import cuda.core.experimental as core + import cuda.bench as bench + + def bench(state: bench.State): + dev = core.Device(state.get_device()) + dev.set_current() + # converts CudaStream to core.Stream + # using __cuda_stream__ protocol + dev.create_stream(state.get_stream()) +)XXX"; + py_cuda_stream_cls.def("__cuda_stream__", method__cuda_stream__impl, method__cuda_stream__doc); + + auto method_addressof_impl = [](const nvbench::cuda_stream &s) -> std::size_t { return reinterpret_cast(s.get_stream()); - }); + }; + static constexpr const char *method_addressof_doc = + R"XXXX(Integral value of address of driver's CUDA stream struct)XXXX"; + py_cuda_stream_cls.def("addressof", method_addressof_impl, method_addressof_doc); +} - // == STEP 3 +void def_class_Launch(py::module_ m) +{ // Define Launch class // ATTN: nvbench::launch is move-only class // Methods: // nvbench::launch::get_stream -> nvbench::cuda_stream - auto py_launch_cls = py::class_(m, "Launch"); - - py_launch_cls.def( - "get_stream", - [](nvbench::launch &launch) { return
std::ref(launch.get_stream()); }, - py::return_value_policy::reference); + static constexpr const char *class_Launch_doc = R"XXXX( +Configuration object for function launch. + + Note + ---- + The class is not user-constructible. +)XXXX"; + auto py_launch_cls = py::class_(m, "Launch", class_Launch_doc); + + auto method_get_stream_impl = [](nvbench::launch &launch) { + return std::ref(launch.get_stream()); + }; + static constexpr const char *method_get_stream_doc = + R"XXXX(Get CUDA stream of this configuration)XXXX"; + py_launch_cls.def("get_stream", + method_get_stream_impl, + method_get_stream_doc, + py::return_value_policy::reference); +} - // == STEP 4 +static void def_class_Benchmark(py::module_ m) +{ // Define Benchmark class // ATTN: nvbench::benchmark_base is move-only class // Methods: @@ -336,164 +361,268 @@ PYBIND11_MODULE(_nvbench, m) // nvbench::benchmark_base::set_criterion_param_string // nvbench::benchmark_base::set_min_samples - auto py_benchmark_cls = py::class_(m, "Benchmark"); - py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); + static constexpr const char *class_Benchmark_doc = R"XXXX( +Represents NVBench benchmark. - py_benchmark_cls.def( - "add_int64_axis", - [](nvbench::benchmark_base &self, std::string name, std::vector data) { - self.add_int64_axis(std::move(name), std::move(data)); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("values")); + Note + ---- + The class is not user-constructible. - py_benchmark_cls.def( - "add_int64_power_of_two_axis", + Use `~register` function to create Benchmark and register + it with NVBench. 
+)XXXX"; + auto py_benchmark_cls = py::class_(m, "Benchmark", class_Benchmark_doc); + + // method Benchmark.get_name + auto method_get_name_impl = &nvbench::benchmark_base::get_name; + static constexpr const char *method_get_name_doc = R"XXXX(Get benchmark name)XXXX"; + py_benchmark_cls.def("get_name", method_get_name_impl, method_get_name_doc); + + // method Benchmark.add_int64_axis + auto method_add_int64_axis_impl = [](nvbench::benchmark_base &self, std::string name, std::vector data) { - self.add_int64_axis(std::move(name), - std::move(data), - nvbench::int64_axis_flags::power_of_two); + self.add_int64_axis(std::move(name), std::move(data)); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("values")); - - py_benchmark_cls.def( - "add_float64_axis", + }; + static constexpr const char *method_add_int64_axis_doc = R"XXXX( +Add integral type parameter axis with given name and values to sweep over +)XXXX"; + py_benchmark_cls.def("add_int64_axis", + method_add_int64_axis_impl, + method_add_int64_axis_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); + + // method Benchmark.add_int64_power_of_two_axis + auto method_add_int64_power_of_two_axis_impl = [](nvbench::benchmark_base &self, + std::string name, + std::vector data) { + self.add_int64_axis(std::move(name), std::move(data), nvbench::int64_axis_flags::power_of_two); + return std::ref(self); + }; + static constexpr const char *method_add_int64_power_of_two_axis_doc = R"XXXX( +Add integral type parameter axis with given name and power of two values to sweep over +)XXXX"; + py_benchmark_cls.def("add_int64_power_of_two_axis", + method_add_int64_power_of_two_axis_impl, + method_add_int64_power_of_two_axis_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); + + // method Benchmark.add_float64_axis + auto method_add_float64_axis_impl = [](nvbench::benchmark_base &self, std::string name, std::vector data) { 
self.add_float64_axis(std::move(name), std::move(data)); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("values")); - - py_benchmark_cls.def( - "add_string_axis", + }; + static constexpr const char *method_add_float64_axis_doc = R"XXXX( +Add floating-point type parameter axis with given name and values to sweep over +)XXXX"; + py_benchmark_cls.def("add_float64_axis", + method_add_float64_axis_impl, + method_add_float64_axis_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); + + // method Benchmark.add_string_axis + auto method_add_string_axis_impl = [](nvbench::benchmark_base &self, std::string name, std::vector data) { self.add_string_axis(std::move(name), std::move(data)); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("values")); - - py_benchmark_cls.def( - "set_name", - [](nvbench::benchmark_base &self, std::string name) { - self.set_name(std::move(name)); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name")); - - py_benchmark_cls.def( - "set_is_cpu_only", - [](nvbench::benchmark_base &self, bool is_cpu_only) { - self.set_is_cpu_only(is_cpu_only); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("is_cpu_only")); - + }; + static constexpr const char *method_add_string_axis_doc = R"XXXX( +Add string type parameter axis with given name and values to sweep over +)XXXX"; + py_benchmark_cls.def("add_string_axis", + method_add_string_axis_impl, + method_add_string_axis_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); + + // method Benchmark.set_name + auto method_set_name_impl = [](nvbench::benchmark_base &self, std::string name) { + self.set_name(std::move(name)); + return std::ref(self); + }; + static constexpr const char *method_set_name_doc = R"XXXX(Set benchmark name)XXXX"; + py_benchmark_cls.def("set_name", + method_set_name_impl, +
method_set_name_doc, + py::return_value_policy::reference, + py::arg("name")); + + // method Benchmark.set_is_cpu_only + auto method_set_is_cpu_only_impl = [](nvbench::benchmark_base &self, bool is_cpu_only) { + self.set_is_cpu_only(is_cpu_only); + return std::ref(self); + }; + static constexpr const char *method_set_is_cpu_only_doc = + R"XXXX(Set whether this benchmark only executes on CPU)XXXX"; + py_benchmark_cls.def("set_is_cpu_only", + method_set_is_cpu_only_impl, + method_set_is_cpu_only_doc, + py::return_value_policy::reference, + py::arg("is_cpu_only")); + + // method Benchmark.set_run_once + auto method_set_run_once_impl = [](nvbench::benchmark_base &self, bool run_once) { + self.set_run_once(run_once); + return std::ref(self); + }; + static constexpr const char *method_set_run_once_doc = R"XXXX( +Set whether all benchmark configurations are executed only once +)XXXX"; // TODO: should this be exposed? - py_benchmark_cls.def( - "set_run_once", - [](nvbench::benchmark_base &self, bool run_once) { - self.set_run_once(run_once); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("run_once")); - - py_benchmark_cls.def( - "set_skip_time", - [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) { - self.set_skip_time(skip_duration_seconds); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("duration_seconds")); - - py_benchmark_cls.def( - "set_timeout", - [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) { - self.set_timeout(duration_seconds); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("duration_seconds")); - - py_benchmark_cls.def( - "set_throttle_threshold", - [](nvbench::benchmark_base &self, nvbench::float32_t threshold) { - self.set_throttle_threshold(threshold); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("threshold")); - - py_benchmark_cls.def( - "set_throttle_recovery_delay", - 
[](nvbench::benchmark_base &self, nvbench::float32_t delay) { - self.set_throttle_recovery_delay(delay); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("delay_seconds")); - - py_benchmark_cls.def( - "set_stopping_criterion", - [](nvbench::benchmark_base &self, std::string criterion) { - self.set_stopping_criterion(std::move(criterion)); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("criterion")); - - py_benchmark_cls.def( - "set_criterion_param_int64", + py_benchmark_cls.def("set_run_once", + method_set_run_once_impl, + method_set_run_once_doc, + py::return_value_policy::reference, + py::arg("run_once")); + + // method Benchmark.set_skip_time + auto method_set_skip_time_impl = [](nvbench::benchmark_base &self, + nvbench::float64_t skip_duration_seconds) { + self.set_skip_time(skip_duration_seconds); + return std::ref(self); + }; + static constexpr const char *method_set_skip_time_doc = R"XXXX( +Set value, in seconds, such that runs with duration shorter than this are skipped +)XXXX"; + py_benchmark_cls.def("set_skip_time", + method_set_skip_time_impl, + method_set_skip_time_doc, + py::return_value_policy::reference, + py::arg("duration_seconds")); + + // method Benchmark.set_timeout + auto method_set_timeout_impl = [](nvbench::benchmark_base &self, + nvbench::float64_t duration_seconds) { + self.set_timeout(duration_seconds); + return std::ref(self); + }; + static constexpr const char *method_set_timeout_doc = R"XXXX( +Set benchmark run duration timeout value, in seconds +)XXXX"; + py_benchmark_cls.def("set_timeout", + method_set_timeout_impl, + method_set_timeout_doc, + py::return_value_policy::reference, + py::arg("duration_seconds")); + + // method Benchmark.set_throttle_threshold + auto method_set_throttle_threshold_impl = [](nvbench::benchmark_base &self, + nvbench::float32_t threshold) { + self.set_throttle_threshold(threshold); + return std::ref(self); + }; + static constexpr const char 
*method_set_throttle_threshold_doc = R"XXXX( +Set throttle threshold, as a fraction of maximal GPU frequency, in percents +)XXXX"; + py_benchmark_cls.def("set_throttle_threshold", + method_set_throttle_threshold_impl, + method_set_throttle_threshold_doc, + py::return_value_policy::reference, + py::arg("threshold")); + + // method Benchmark.set_throttle_recovery_delay + auto method_set_throttle_recovery_delay_impl = [](nvbench::benchmark_base &self, + nvbench::float32_t delay) { + self.set_throttle_recovery_delay(delay); + return std::ref(self); + }; + static constexpr const char *method_set_throttle_recovery_delay_doc = R"XXXX( +Set throttle recovery delay, in seconds +)XXXX"; + py_benchmark_cls.def("set_throttle_recovery_delay", + method_set_throttle_recovery_delay_impl, + method_set_throttle_recovery_delay_doc, + py::return_value_policy::reference, + py::arg("delay_seconds")); + + // method Benchmark.set_stopping_criterion + auto method_set_stopping_criterion_impl = [](nvbench::benchmark_base &self, + std::string criterion) { + self.set_stopping_criterion(std::move(criterion)); + return std::ref(self); + }; + static constexpr const char *method_set_stopping_criterion_doc = R"XXXX( +Set stopping criterion to be used +)XXXX"; + py_benchmark_cls.def("set_stopping_criterion", + method_set_stopping_criterion_impl, + method_set_stopping_criterion_doc, + py::return_value_policy::reference, + py::arg("criterion")); + + // method Benchmark.set_criterion_param_int64 + auto method_set_criterion_param_int64_impl = [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) { self.set_criterion_param_int64(std::move(name), value); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("value")); - - py_benchmark_cls.def( - "set_criterion_param_float64", + }; + static constexpr const char *method_set_criterion_param_int64_doc = R"XXXX( +Set stopping criterion integer parameter value +)XXXX"; + 
py_benchmark_cls.def("set_criterion_param_int64", + method_set_criterion_param_int64_impl, + method_set_criterion_param_int64_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + + // method Benchmark.set_criterion_param_float64 + auto method_set_criterion_param_float64_impl = [](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) { self.set_criterion_param_float64(std::move(name), value); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("value")); - - py_benchmark_cls.def( - "set_criterion_param_string", + }; + static constexpr const char *method_set_criterion_param_float64_doc = R"XXXX( +Set stopping criterion floating point parameter value +)XXXX"; + py_benchmark_cls.def("set_criterion_param_float64", + method_set_criterion_param_float64_impl, + method_set_criterion_param_float64_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + + // method Benchmark.set_criterion_param_string + auto method_set_criterion_param_string_impl = [](nvbench::benchmark_base &self, std::string name, std::string value) { self.set_criterion_param_string(std::move(name), std::move(value)); return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("name"), - py::arg("value")); - - py_benchmark_cls.def( - "set_min_samples", - [](nvbench::benchmark_base &self, nvbench::int64_t count) { - self.set_min_samples(count); - return std::ref(self); - }, - py::return_value_policy::reference, - py::arg("min_samples_count")); + }; + static constexpr const char *method_set_criterion_param_string_doc = R"XXXX( +Set stopping criterion string parameter value +)XXXX"; + py_benchmark_cls.def("set_criterion_param_string", + method_set_criterion_param_string_impl, + method_set_criterion_param_string_doc, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + + // method Benchmark.set_min_samples + auto method_set_min_samples_impl =
[](nvbench::benchmark_base &self, nvbench::int64_t count) { + self.set_min_samples(count); + return std::ref(self); + }; + static constexpr const char *method_set_min_samples_doc = R"XXXX( +Set minimal samples count before stopping criterion applies +)XXXX"; + py_benchmark_cls.def("set_min_samples", + method_set_min_samples_impl, + method_set_min_samples_doc, + py::return_value_policy::reference, + py::arg("min_samples_count")); +} - // == STEP 5 +void def_class_State(py::module_ m) +{ // Define PyState class // ATTN: nvbench::state is move-only class // Methods: @@ -561,244 +690,520 @@ PYBIND11_MODULE(_nvbench, m) // NOTE: // State wraps std::reference_wrapper - using state_ref_t = std::reference_wrapper; - auto pystate_cls = py::class_(m, "State"); + using state_ref_t = std::reference_wrapper; + static constexpr const char *class_State_doc = R"XXXX( +Represent benchmark configuration state. - pystate_cls.def("has_device", [](const nvbench::state &state) -> bool { - return static_cast(state.get_device()); - }); + Note + ---- + The class is not user-constructible. 
+)XXXX"; + auto pystate_cls = py::class_(m, "State", class_State_doc); - pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool { + // method State.has_device + auto method_has_device_impl = [](const nvbench::state &state) -> bool { + return static_cast(state.get_device()); + }; + static constexpr const char *method_has_device_doc = R"XXXX( +Returns True if configuration has a device +)XXXX"; + pystate_cls.def("has_device", method_has_device_impl, method_has_device_doc); + + // method State.has_printers + auto method_has_printers_impl = [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); - }); - - pystate_cls.def("get_device", [](const nvbench::state &state) { + }; + static constexpr const char *method_has_printers_doc = R"XXXX( +Returns True if configuration has a printer" +)XXXX"; + pystate_cls.def("has_printers", method_has_printers_impl, method_has_printers_doc); + + // method State.get_device + auto method_get_device_impl = [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) { return py::cast(dev.value().get_id()); } return py::object(py::none()); - }); - - pystate_cls.def( - "get_stream", - [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); }, - py::return_value_policy::reference); - - pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name")); + }; + static constexpr const char *method_get_device_doc = R"XXXX( +Get device_id of the device from this configuration +)XXXX"; + pystate_cls.def("get_device", method_get_device_impl, method_get_device_doc); + + // method State.get_stream + auto method_get_stream_impl = [](nvbench::state &state) { + return std::ref(state.get_cuda_stream()); + }; + static constexpr const char *method_get_stream_doc = R"XXXX( +Get `~CudaStream` object from this configuration" +)XXXX"; + pystate_cls.def("get_stream", + method_get_stream_impl, + method_get_stream_doc, + py::return_value_policy::reference); + 
+ // method State.get_int64 + auto method_get_int64_impl = &nvbench::state::get_int64; + static constexpr const char *method_get_int64_doc = R"XXXX( +Get value for given Int64 axis from this configuration +)XXXX"; + pystate_cls.def("get_int64", method_get_int64_impl, method_get_int64_doc, py::arg("name")); + + // method State.get_int64_or_default + auto method_get_int64_or_default_impl = &nvbench::state::get_int64_or_default; + static constexpr const char *method_get_int64_or_default_doc = method_get_int64_doc; pystate_cls.def("get_int64_or_default", - &nvbench::state::get_int64_or_default, + method_get_int64_or_default_impl, + method_get_int64_or_default_doc, py::arg("name"), py::pos_only{}, py::arg("default_value")); - pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name")); + // method State.get_float64 + auto method_get_float64_impl = &nvbench::state::get_float64; + static constexpr const char *method_get_float64_doc = R"XXXX( +Get value for given Float64 axis from this configuration +)XXXX"; + pystate_cls.def("get_float64", method_get_float64_impl, method_get_float64_doc, py::arg("name")); + + // method State.get_float64_or_default + static constexpr const char *method_get_float64_or_default_doc = method_get_float64_doc; pystate_cls.def("get_float64_or_default", &nvbench::state::get_float64_or_default, + method_get_float64_or_default_doc, py::arg("name"), py::pos_only{}, py::arg("default_value")); - pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name")); + // method State.get_string + static constexpr const char *method_get_string_doc = R"XXXX( +Get value for given String axis from this configuration +)XXXX"; + pystate_cls.def("get_string", &nvbench::state::get_string, method_get_string_doc, py::arg("name")); + + // method State.get_string_or_default + static constexpr const char *method_get_string_or_default_doc = method_get_string_doc; pystate_cls.def("get_string_or_default", &nvbench::state::get_string_or_default, 
+ method_get_string_or_default_doc, py::arg("name"), py::pos_only{}, py::arg("default_value")); + // method State.get_element_count + static constexpr const char *method_add_element_count_doc = R"XXXX( +Add element count" +)XXXX"; pystate_cls.def("add_element_count", &nvbench::state::add_element_count, + method_add_element_count_doc, py::arg("count"), py::arg("column_name") = py::str("")); - pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count")); - pystate_cls.def("get_element_count", &nvbench::state::get_element_count); - + // method State.set_element_count + static constexpr const char *method_set_element_count_doc = R"XXXX( +Set element count +)XXXX"; + pystate_cls.def("set_element_count", + &nvbench::state::set_element_count, + method_set_element_count_doc, + py::arg("count")); + + // method State.get_element_count + static constexpr const char *method_get_element_count = R"XXXX( +Get element count +)XXXX"; + pystate_cls.def("get_element_count", + &nvbench::state::get_element_count, + method_get_element_count); + + // method State.skip + static constexpr const char *method_skip_doc = "Skip this configuration"; pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason")); - pystate_cls.def("is_skipped", &nvbench::state::is_skipped); - pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason); - pystate_cls.def( - "add_global_memory_reads", + // method State.is_skipped + static constexpr const char *method_is_skipped_doc = R"XXXX( +Returns True if this configuration is being skipped"; +)XXXX"; + pystate_cls.def("is_skipped", &nvbench::state::is_skipped, method_is_skipped_doc); + + // method State.get_skip_reason + static constexpr const char *method_get_skip_reason_doc = R"XXXX( +Get reason provided for skipping this configuration +)XXXX"; + pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason, method_get_skip_reason_doc); + + // method State.add_global_memory_reads + auto 
method_add_global_memory_reads_impl = [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { - state.add_global_memory_reads(nbytes, column_name); - }, - "Add size, in bytes, of global memory reads", - py::arg("nbytes"), - py::pos_only{}, - py::arg("column_name") = py::str("")); - - pystate_cls.def( - "add_global_memory_writes", + state.add_global_memory_reads(nbytes, column_name); + }; + static constexpr const char *method_add_global_memory_reads_doc = R"XXXX( +Inform NVBench that given amount of bytes is being read by the benchmark from global memory +)XXXX"; + pystate_cls.def("add_global_memory_reads", + method_add_global_memory_reads_impl, + method_add_global_memory_reads_doc, + py::arg("nbytes"), + py::pos_only{}, + py::arg("column_name") = py::str("")); + + // method State.add_global_memory_writes + auto method_add_global_memory_writes_impl = [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { - state.add_global_memory_writes(nbytes, column_name); - }, - "Add size, in bytes, of global memory writes", - py::arg("nbytes"), - py::pos_only{}, - py::arg("column_name") = py::str("")); - - pystate_cls.def( - "get_benchmark", - [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, - py::return_value_policy::reference); - - pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); + state.add_global_memory_writes(nbytes, column_name); + }; + static constexpr const char *method_add_global_memory_writes_doc = R"XXXX( +Inform NVBench that given amount of bytes is being written by the benchmark into global memory +)XXXX"; + pystate_cls.def("add_global_memory_writes", + method_add_global_memory_writes_impl, + method_add_global_memory_writes_doc, + py::arg("nbytes"), + py::pos_only{}, + py::arg("column_name") = py::str("")); + + // method State.get_benchmark + auto method_get_benchmark_impl = [](const nvbench::state &state) { + return 
std::ref(state.get_benchmark()); + }; + static constexpr const char *method_get_benchmark_doc = R"XXXX( +Get Benchmark this configuration is a part of +)XXXX"; + pystate_cls.def("get_benchmark", + method_get_benchmark_impl, + method_get_benchmark_doc, + py::return_value_policy::reference); + + // method State.get_throttle_threshold + static constexpr const char *method_get_throttle_threshold_doc = R"XXXX( +Get throttle threshold value, as fraction of maximal frequency. + +Note +---- + A valid threshold value is between 0 and 1. +)XXXX"; + pystate_cls.def("get_throttle_threshold", + &nvbench::state::get_throttle_threshold, + method_get_throttle_threshold_doc); + + // method State.set_throttle_threshold + static constexpr const char *method_set_throttle_threshold_doc = R"XXXX( +Set throttle threshold fraction to the specified value, expected to be between 0 and 1" +)XXXX"; pystate_cls.def("set_throttle_threshold", &nvbench::state::set_throttle_threshold, + method_set_throttle_threshold_doc, py::arg("throttle_fraction")); - pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples); + // method State.get_min_samples + static constexpr const char *method_get_min_samples_doc = R"XXXX( +Get the number of benchmark timings NVBench performs before stopping criterion begins being used +)XXXX"; + pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples, method_get_min_samples_doc); + + // method State.set_min_samples + static constexpr const char *method_set_min_samples_doc = R"XXXX( +Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used +)XXXX"; pystate_cls.def("set_min_samples", &nvbench::state::set_min_samples, + method_set_min_samples_doc, py::arg("min_samples_count")); - pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); - + // method State.get_disable_blocking_kernel + static constexpr const char *method_get_disable_blocking_kernel_doc = R"XXXX( +Return True 
if use of blocking kernel by NVBench is disabled, False otherwise +)XXXX"; + pystate_cls.def("get_disable_blocking_kernel", + &nvbench::state::get_disable_blocking_kernel, + method_get_disable_blocking_kernel_doc); + + // method State.set_disable_blocking_kernel + static constexpr const char *method_set_disable_blocking_kernel_doc = R"XXXX( +Use argument True to disable use of blocking kernel by NVBench" +)XXXX"; pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel, + method_set_disable_blocking_kernel_doc, py::arg("disable_blocking_kernel")); - pystate_cls.def("get_run_once", &nvbench::state::get_run_once); - - pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once")); - - pystate_cls.def("get_timeout", &nvbench::state::get_timeout); - - pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration")); - - pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); - + // method State.get_run_once + static constexpr const char *method_get_run_once_doc = + R"XXXX(Boolean flag indicating whether configuration should only run once)XXXX"; + pystate_cls.def("get_run_once", &nvbench::state::get_run_once, method_get_run_once_doc); + + // method State.set_run_once + static constexpr const char *method_set_run_once_doc = + R"XXXX(Set run-once flag for this configuration)XXXX"; + pystate_cls.def("set_run_once", + &nvbench::state::set_run_once, + method_set_run_once_doc, + py::arg("run_once")); + + // method State.get_timeout + static constexpr const char *method_get_timeout_doc = + R"XXXX(Get time-out value for benchmark execution of this configuration, in seconds)XXXX"; + pystate_cls.def("get_timeout", &nvbench::state::get_timeout, method_get_timeout_doc); + + // method State.set_timeout + static constexpr const char *method_set_timeout_doc = + R"XXXX(Set time-out value for benchmark execution of this configuration, in seconds)XXXX"; + 
pystate_cls.def("set_timeout", + &nvbench::state::set_timeout, + method_set_timeout_doc, + py::arg("duration_seconds")); + + // method State.get_blocking_kernel_timeout + static constexpr const char *method_get_blocking_kernel_timeout_doc = + R"XXXX(Get time-out value for execution of blocking kernel, in seconds)XXXX"; + pystate_cls.def("get_blocking_kernel_timeout", + &nvbench::state::get_blocking_kernel_timeout, + method_get_blocking_kernel_timeout_doc); + + // method State.set_blocking_kernel_timeout + static constexpr const char *method_set_blocking_kernel_timeout_doc = + R"XXXX(Set time-out value for execution of blocking kernel, in seconds)XXXX"; pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout, - py::arg("duration")); - - pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); + method_set_blocking_kernel_timeout_doc, + py::arg("duration_seconds")); + + // method State.collect_cupti_metrics + static constexpr const char *method_collect_cupti_metrics_doc = + R"XXXX(Request NVBench to record CUPTI metrics while running benchmark for this configuration)XXXX"; + pystate_cls.def("collect_cupti_metrics", + &nvbench::state::collect_cupti_metrics, + method_collect_cupti_metrics_doc); + + // method State.is_cupti_required + static constexpr const char *method_is_cupti_required_doc = + R"XXXX(True if (some) CUPTI metrics are being collected)XXXX"; + pystate_cls.def("is_cupti_required", + &nvbench::state::is_cupti_required, + method_is_cupti_required_doc); + + // method State.exec + auto method_exec_impl = + [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) -> void { + if (!PyCallable_Check(py_launcher_fn.ptr())) + { + throw py::type_error("Argument of exec method must be a callable object"); + } - pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); + // wrapper to invoke Python callable + auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch 
&launch_descr) -> void { + // cast C++ object to python object + auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference); + // call Python callable + py_launcher_fn(launch_pyarg); + }; - pystate_cls.def( - "exec", - [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) { - if (!PyCallable_Check(py_launcher_fn.ptr())) + if (sync) + { + if (batched) { - throw py::type_error("Argument of exec method must be a callable object"); + constexpr auto tag = nvbench::exec_tag::sync; + state.exec(tag, cpp_launcher_fn); } - - // wrapper to invoke Python callable - auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void { - // cast C++ object to python object - auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference); - // call Python callable - py_launcher_fn(launch_pyarg); - }; - - if (sync) + else { - if (batched) - { - constexpr auto tag = nvbench::exec_tag::sync; - state.exec(tag, cpp_launcher_fn); - } - else - { - constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch; - state.exec(tag, cpp_launcher_fn); - } + constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch; + state.exec(tag, cpp_launcher_fn); + } + } + else + { + if (batched) + { + constexpr auto tag = nvbench::exec_tag::none; + state.exec(tag, cpp_launcher_fn); } else { - if (batched) - { - constexpr auto tag = nvbench::exec_tag::none; - state.exec(tag, cpp_launcher_fn); - } - else - { - constexpr auto tag = nvbench::exec_tag::no_batch; - state.exec(tag, cpp_launcher_fn); - } + constexpr auto tag = nvbench::exec_tag::no_batch; + state.exec(tag, cpp_launcher_fn); } - }, - "Executor for given launcher callable fn(state : Launch)", - py::arg("launcher_fn"), - py::pos_only{}, - py::arg("batched") = true, - py::arg("sync") = false); + } + }; + static constexpr const char *method_exec_doc = R"XXXX( +Execute callable running the benchmark. 
+ + The callable may be executed multiple times. The callable + will be passed `~Launch` object argument. + + Parameters + ---------- + fn: Callable + Python callable with signature fn(Launch) -> None that executes the benchmark. + batched: bool, optional + If `True`, no cache flushing is performed between callable invocations. + Default: `True`. + sync: bool, optional + True value indicates that callable performs device synchronization. + NVBench disables use of blocking kernel in this case. + Default: `False`. + +)XXXX"; + pystate_cls.def("exec", + method_exec_impl, + method_exec_doc, + py::arg("launcher_fn"), + py::pos_only{}, + py::arg("batched") = true, + py::arg("sync") = false); + // method State.get_short_description + static constexpr const char *method_get_short_description_doc = R"XXXX( +Get short description for this configuration +)XXXX"; pystate_cls.def("get_short_description", - [](const nvbench::state &state) { return state.get_short_description(); }); + &nvbench::state::get_short_description, + method_get_short_description_doc); - pystate_cls.def( - "add_summary", + // method State.add_summary + auto method_add_summary_string_value_impl = [](nvbench::state &state, std::string column_name, std::string value) { auto &summ = state.add_summary("nv/python/" + column_name); summ.set_string("description", "User tag: " + column_name); summ.set_string("name", std::move(column_name)); summ.set_string("value", std::move(value)); - }, - py::arg("name"), - py::arg("value")); + }; + static constexpr const char *method_add_summary_doc = R"XXXX( +Add summary column with given name and value +)XXXX"; + pystate_cls.def("add_summary", + method_add_summary_string_value_impl, + method_add_summary_doc, + py::arg("name"), + py::arg("value")); - pystate_cls.def( - "add_summary", - [](nvbench::state &state, std::string column_name, std::int64_t value) { + auto method_add_summary_int64_value_impl = + [](nvbench::state &state, std::string column_name, nvbench::int64_t value) 
{ auto &summ = state.add_summary("nv/python/" + column_name); summ.set_string("description", "User tag: " + column_name); summ.set_string("name", std::move(column_name)); summ.set_int64("value", value); - }, - py::arg("name"), - py::arg("value")); + }; + pystate_cls.def("add_summary", + method_add_summary_int64_value_impl, + method_add_summary_doc, + py::arg("name"), + py::arg("value")); - pystate_cls.def( - "add_summary", - [](nvbench::state &state, std::string column_name, double value) { + auto method_add_summary_float64_value_impl = + [](nvbench::state &state, std::string column_name, nvbench::float64_t value) { auto &summ = state.add_summary("nv/python/" + column_name); summ.set_string("description", "User tag: " + column_name); summ.set_string("name", std::move(column_name)); summ.set_float64("value", value); - }, - py::arg("name"), - py::arg("value")); + }; + pystate_cls.def("add_summary", + method_add_summary_float64_value_impl, + method_add_summary_doc, + py::arg("name"), + py::arg("value")); + // method State.get_axis_values_as_string + static constexpr const char *method_get_axis_values_as_string_doc = R"XXXX( +Get string of space-separated name=value pairs for this configuration +)XXXX"; pystate_cls.def("get_axis_values_as_string", - [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); - pystate_cls.def("get_axis_values", &py_get_axis_values); - pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion); + &nvbench::state::get_axis_values_as_string, + method_get_axis_values_as_string_doc); + + // method State.get_axis_values + static constexpr const char *method_get_axis_values_doc = R"XXXX( +Get dictionary with axis values for this configuration +)XXXX"; + pystate_cls.def("get_axis_values", &py_get_axis_values, method_get_axis_values_doc); + + // method State.get_stopping_criterion + static constexpr const char *method_get_stopping_criterion_doc = R"XXXX( +Get string name of the stopping criterion used 
+)XXXX"; + pystate_cls.def("get_stopping_criterion", + &nvbench::state::get_stopping_criterion, + method_get_stopping_criterion_doc); +} + +} // namespace + +// ========================================== +// PLEASE KEEP IN SYNC WITH __init__.pyi FILE +// ========================================== +// If you modify these bindings, please be sure to update the +// corresponding type hints in ``../cuda/nvbench/__init__.pyi`` + +PYBIND11_MODULE(_nvbench, m) +{ + NVBENCH_DRIVER_API_CALL(cuInit(0)); + + // This line ensures that benchmark_manager has been created during module init + // It is reinitialized before running all benchmarks to set devices to use + nvbench::benchmark_manager::get().initialize(); + + def_class_CudaStream(m); + + def_class_Launch(m); + + def_class_Benchmark(m); + + def_class_State(m); // Use handle to take a memory leak here, since this object's destructor may be called after // interpreter has shut down - benchmark_exc = - py::exception(m, "NVBenchRuntimeError", PyExc_RuntimeError).release(); - // == STEP 6 - // ATTN: nvbench::benchmark_manager is a singleton + static constexpr const char *exception_nvbench_runtime_error_doc = R"XXXX( +An exception raised if running benchmarks encounters an error +)XXXX"; + py::object benchmark_exc_ = + py::exception(m, "NVBenchRuntimeError", PyExc_RuntimeError); + benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc; + + benchmark_exc = benchmark_exc_.release(); + // ATTN: nvbench::benchmark_manager is a singleton, it is exposed through + // GlobalBenchmarkRegistry class global_registry = std::unique_ptr(new GlobalBenchmarkRegistry(), py::nodelete{}); - m.def( - "register", - [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); }, - "Register benchmark function of type Callable[[nvbench.State], None]", - py::return_value_policy::reference, - py::arg("benchmark_fn")); - - m.def( - "run_all_benchmarks", - [&](py::object argv) -> void { - if (!py::isinstance(argv)) - { - throw 
py::type_error("run_all_benchmarks expects a list of command-line arguments"); - } - std::vector args = py::cast>(argv); - global_registry->run(args); - }, - "Run all registered benchmarks", - py::arg("argv") = py::list()); - + // function register + auto func_register_impl = [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); }; + static constexpr const char *func_register_doc = R"XXXX( +Register benchmark function of type Callable[[nvbench.State], None] +)XXXX"; + m.def("register", + func_register_impl, + func_register_doc, + py::return_value_policy::reference, + py::arg("benchmark_fn")); + + // function run_all_benchmarks + auto func_run_all_benchmarks_impl = [&](py::object argv) -> void { + if (!py::isinstance(argv)) + { + throw py::type_error("run_all_benchmarks expects a list of command-line arguments"); + } + std::vector args = py::cast>(argv); + global_registry->run(args); + }; + static constexpr const char *func_run_all_benchmarks_doc = R"XXXX( + Run all benchmarks registered with NVBench. + + Parameters + ---------- + argv: List[str] + Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`. 
+)XXXX"; + m.def("run_all_benchmarks", + func_run_all_benchmarks_impl, + func_run_all_benchmarks_doc, + py::arg("argv") = py::list()); + + // Testing utilities m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); }); m.def("test_py_exception", []() { py::set_error(benchmark_exc, "Test"); diff --git a/python/test/test_nvbench.py b/python/test/test_nvbench.py index 5604a3f2..7d927e8f 100644 --- a/python/test/test_nvbench.py +++ b/python/test/test_nvbench.py @@ -37,3 +37,54 @@ def test_cpu_only(): b.set_is_cpu_only(True) bench.run_all_benchmarks(["-q", "--profile"]) + + +def docstring_check(doc_str: str) -> None: + assert isinstance(doc_str, str) + assert len(doc_str) > 0 + + +def obj_has_docstring_check(o: object) -> None: + docstring_check(o.__doc__) + + +def test_module_doc(): + obj_has_docstring_check(bench) + + +def test_register_doc(): + obj_has_docstring_check(bench.register) + + +def test_run_all_benchmarks_doc(): + obj_has_docstring_check(bench.run_all_benchmarks) + + +def test_State_doc(): + cl = bench.State + obj_has_docstring_check(cl) + obj_has_docstring_check(cl.exec) + obj_has_docstring_check(cl.get_int64) + obj_has_docstring_check(cl.get_float64) + obj_has_docstring_check(cl.get_string) + obj_has_docstring_check(cl.skip) + + +def test_Launch_doc(): + cl = bench.Launch + obj_has_docstring_check(cl) + obj_has_docstring_check(cl.get_stream) + + +def test_CudaStream_doc(): + cl = bench.CudaStream + obj_has_docstring_check(cl) + + +def test_Benchmark_doc(): + cl = bench.Benchmark + obj_has_docstring_check(cl) + obj_has_docstring_check(cl.add_int64_axis) + obj_has_docstring_check(cl.add_int64_power_of_two_axis) + obj_has_docstring_check(cl.add_float64_axis) + obj_has_docstring_check(cl.add_string_axis) From 6a8bac520e069e804cda1a6122d23044819e51ea Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:02:42 -0600 Subject: [PATCH 2/2] Replace use of py::handle to store 
global_registry Use py::gil_safe_call_once_and_store facility pybind11 provides. --- python/src/py_nvbench.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index e01a33ce..39667df4 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -124,7 +124,8 @@ struct nvbench_run_error : std::runtime_error // that are defined for the base class using std::runtime_error::runtime_error; }; -py::handle benchmark_exc{}; + +PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store exc_storage; void run_interruptible(nvbench::option_parser &parser) { @@ -223,18 +224,18 @@ class GlobalBenchmarkRegistry } catch (py::error_already_set &e) { - py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); + py::raise_from(e, exc_storage.get_stored().ptr(), "Python error raised "); throw py::error_already_set(); } catch (const std::exception &e) { const std::string &exc_message = e.what(); - py::set_error(benchmark_exc, exc_message.c_str()); + py::set_error(exc_storage.get_stored(), exc_message.c_str()); throw py::error_already_set(); } catch (...) 
{ - py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); + py::set_error(exc_storage.get_stored(), "Caught unknown exception in nvbench_main"); throw py::error_already_set(); } } @@ -1158,11 +1159,12 @@ PYBIND11_MODULE(_nvbench, m) static constexpr const char *exception_nvbench_runtime_error_doc = R"XXXX( An exception raised if running benchmarks encounters an error )XXXX"; - py::object benchmark_exc_ = - py::exception(m, "NVBenchRuntimeError", PyExc_RuntimeError); - benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc; - - benchmark_exc = benchmark_exc_.release(); + exc_storage.call_once_and_store_result([&]() { + py::object benchmark_exc_ = + py::exception(m, "NVBenchRuntimeError", PyExc_RuntimeError); + benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc; + return benchmark_exc_; + }); // ATTN: nvbench::benchmark_manager is a singleton, it is exposed through // GlobalBenchmarkRegistry class @@ -1171,7 +1173,7 @@ An exception raised if running benchmarks encounters an error py::nodelete{}); // function register - auto func_register_impl = [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); }; + auto func_register_impl = [](py::object fn) { return std::ref(global_registry->add_bench(fn)); }; static constexpr const char *func_register_doc = R"XXXX( Register benchmark function of type Callable[[nvbench.State], None] )XXXX"; @@ -1206,7 +1208,7 @@ Register benchmark function of type Callable[[nvbench.State], None] // Testing utilities m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); }); m.def("test_py_exception", []() { - py::set_error(benchmark_exc, "Test"); + py::set_error(exc_storage.get_stored(), "Test"); throw py::error_already_set(); }); }