From 0f3045bb2c4899ea4233b68a0a666b8ae8e18c74 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 9 Dec 2025 09:39:10 -0600
Subject: [PATCH 1/2] Move docstrings from PYI file to implementation

Added tests that docstrings exist and are not empty.

This closes #291
---
 python/cuda/bench/__init__.py  |    4 +
 python/cuda/bench/__init__.pyi |  319 ++--------
 python/src/py_nvbench.cpp      | 1069 ++++++++++++++++++++++----------
 python/test/test_nvbench.py    |   51 ++
 4 files changed, 855 insertions(+), 588 deletions(-)

diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py
index e1d2282a..8444b501 100644
--- a/python/cuda/bench/__init__.py
+++ b/python/cuda/bench/__init__.py
@@ -57,3 +57,7 @@
 )
 
 del load_nvidia_dynamic_lib
+
+__doc__ = """
+CUDA Kernel Benchmarking Library Python API
+"""
diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi
index 86681fc4..76ebcdc4 100644
--- a/python/cuda/bench/__init__.pyi
+++ b/python/cuda/bench/__init__.pyi
@@ -29,223 +29,74 @@ from collections.abc import Callable, Sequence
 from typing import Optional, Self, SupportsFloat, SupportsInt, Union
 
 class CudaStream:
-    """Represents CUDA stream
-
-    Note
-    ----
-        The class is not user-constructible.
-    """
-    def __cuda_stream__(self) -> tuple[int, int]:
-        """
-        Special method implement CUDA stream protocol
-        from `cuda.core`. Returns a pair of integers:
-        (protocol_version, integral_value_of_cudaStream_t pointer)
-
-        Example
-        -------
-            import cuda.core.experimental as core
-            import cuda.bench as bench
-
-            def bench(state: bench.State):
-                dev = core.Device(state.get_device())
-                dev.set_current()
-                # converts CudaString to core.Stream
-                # using __cuda_stream__ protocol
-                dev.create_stream(state.get_stream())
-        """
-        ...
-
-    def addressof(self) -> int:
-        "Integral value of address of driver's CUDA stream struct"
-        ...
+    def __cuda_stream__(self) -> tuple[int, int]: ...
+    def addressof(self) -> int: ...
 
 class Benchmark:
-    """Represents NVBench benchmark.
-
-    Note
-    ----
-        The class is not user-constructible.
-
-        Use `~register` function to create Benchmark and register
-        it with NVBench.
-    """
-    def get_name(self) -> str:
-        "Get benchmark name"
-        ...
-    def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self:
-        "Add integral type parameter axis with given name and values to sweep over"
-        ...
+    def get_name(self) -> str: ...
+    def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self: ...
     def add_int64_power_of_two_axis(
         self, name: str, values: Sequence[SupportsInt]
-    ) -> Self:
-        "Add integral type parameter axis with given name and values to sweep over"
-        ...
-    def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self:
-        "Add floating-point type parameter axis with given name and values to sweep over"
-        ...
-    def add_string_axis(self, name: str, values: Sequence[str]) -> Self:
-        "Add string type parameter axis with given name and values to sweep over"
-        ...
-    def set_name(self, name: str) -> Self:
-        "Set benchmark name"
-        ...
-    def set_is_cpu_only(self, is_cpu_only: bool) -> Self:
-        "Set whether this benchmark only executes on CPU"
-        ...
-    def set_run_once(self, v: bool) -> Self:
-        "Set whether all benchmark configurations are executed only once"
-        ...
-    def set_skip_time(self, duration_seconds: SupportsFloat) -> Self:
-        "Set run durations, in seconds, that should be skipped"
-        ...
-    def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self:
-        "Set throttle recovery delay, in seconds"
-        ...
-    def set_throttle_threshold(self, threshold: SupportsFloat) -> Self:
-        "Set throttle threshold, as a fraction of maximal GPU frequency"
-        ...
-    def set_timeout(self, duration_seconds: SupportsFloat) -> Self:
-        "Set benchmark run duration timeout value, in seconds"
-        ...
-    def set_stopping_criterion(self, criterion: str) -> Self:
-        "Set stopping criterion to be used"
-        ...
-    def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self:
-        "Set stopping criterion floating point parameter value"
-        ...
-    def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self:
-        "Set stopping criterion integer parameter value"
-        ...
-    def set_criterion_param_string(self, name: str, value: str) -> Self:
-        "Set stopping criterion string parameter value"
-        ...
-    def set_min_samples(self, count: SupportsInt) -> Self:
-        "Set minimal samples count before stopping criterion applies"
-        ...
+    ) -> Self: ...
+    def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: ...
+    def add_string_axis(self, name: str, values: Sequence[str]) -> Self: ...
+    def set_name(self, name: str) -> Self: ...
+    def set_run_once(self, v: bool) -> Self: ...
+    def set_skip_time(self, duration_seconds: SupportsFloat) -> Self: ...
+    def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self: ...
+    def set_throttle_threshold(self, threshold: SupportsFloat) -> Self: ...
+    def set_timeout(self, duration_seconds: SupportsFloat) -> Self: ...
+    def set_stopping_criterion(self, criterion: str) -> Self: ...
+    def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self: ...
+    def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self: ...
+    def set_criterion_param_string(self, name: str, value: str) -> Self: ...
+    def set_min_samples(self, count: SupportsInt) -> Self: ...
 
 class Launch:
-    """Configuration object for function launch.
-
-    Note
-    ----
-        The class is not user-constructible.
-    """
-    def get_stream(self) -> CudaStream:
-        "Get CUDA stream of this configuration"
-        ...
+    def get_stream(self) -> CudaStream: ...
 
 class State:
-    """Represent benchmark configuration state.
-
-    Note
-    ----
-        The class is not user-constructible.
-    """
-    def has_device(self) -> bool:
-        "True if configuration has a device"
-        ...
-    def has_printers(self) -> bool:
-        "True if configuration has a printer"
-        ...
-    def get_device(self) -> Union[int, None]:
-        "Get device_id of the device from this configuration"
-        ...
-    def get_stream(self) -> CudaStream:
-        "CudaStream object from this configuration"
-        ...
-    def get_int64(self, name: str) -> int:
-        "Get value for given Int64 axis from this configuration"
-        ...
-    def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int:
-        "Get value for given Int64 axis from this configuration"
-        ...
-    def get_float64(self, name: str) -> float:
-        "Get value for given Float64 axis from this configuration"
-        ...
-    def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float:
-        "Get value for given Float64 axis from this configuration"
-        ...
-    def get_string(self, name: str) -> str:
-        "Get value for given String axis from this configuration"
-        ...
-    def get_string_or_default(self, name: str, default_value: str) -> str:
-        "Get value for given String axis from this configuration"
-        ...
+    def has_device(self) -> bool: ...
+    def has_printers(self) -> bool: ...
+    def get_device(self) -> Union[int, None]: ...
+    def get_stream(self) -> CudaStream: ...
+    def get_int64(self, name: str) -> int: ...
+    def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int: ...
+    def get_float64(self, name: str) -> float: ...
+    def get_float64_or_default(
+        self, name: str, default_value: SupportsFloat
+    ) -> float: ...
+    def get_string(self, name: str) -> str: ...
+    def get_string_or_default(self, name: str, default_value: str) -> str: ...
     def add_element_count(
         self, count: SupportsInt, column_name: Optional[str] = None
-    ) -> None:
-        "Add element count"
-        ...
-    def set_element_count(self, count: SupportsInt) -> None:
-        "Set element count"
-        ...
-    def get_element_count(self) -> int:
-        "Get element count"
-        ...
-    def skip(self, reason: str) -> None:
-        "Skip this configuration"
-        ...
-    def is_skipped(self) -> bool:
-        "Has this configuration been skipped"
-        ...
-    def get_skip_reason(self) -> str:
-        "Get reason provided for skipping this configuration"
-        ...
+    ) -> None: ...
+    def set_element_count(self, count: SupportsInt) -> None: ...
+    def get_element_count(self) -> int: ...
+    def skip(self, reason: str) -> None: ...
+    def is_skipped(self) -> bool: ...
+    def get_skip_reason(self) -> str: ...
     def add_global_memory_reads(
         self, nbytes: SupportsInt, /, column_name: str = ""
-    ) -> None:
-        "Inform NVBench that given amount of bytes is being read by the benchmark from global memory"
-        ...
+    ) -> None: ...
     def add_global_memory_writes(
         self, nbytes: SupportsInt, /, column_name: str = ""
-    ) -> None:
-        "Inform NVBench that given amount of bytes is being written by the benchmark into global memory"
-        ...
-    def get_benchmark(self) -> Benchmark:
-        "Get Benchmark this configuration is a part of"
-        ...
-    def get_throttle_threshold(self) -> float:
-        "Get throttle threshold value, as fraction of maximal frequency"
-        ...
-    def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None:
-        "Set throttle threshold fraction to specified value, expected to be between 0 and 1"
-        ...
-    def get_min_samples(self) -> int:
-        "Get the number of benchmark timings NVBench performs before stopping criterion begins being used"
-        ...
-    def set_min_samples(self, min_samples_count: SupportsInt) -> None:
-        "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used"
-        ...
-    def get_disable_blocking_kernel(self) -> bool:
-        "True if use of blocking kernel by NVBench is disabled, False otherwise"
-        ...
-    def set_disable_blocking_kernel(self, flag: bool) -> None:
-        "Use flag = True to disable use of blocking kernel by NVBench"
-        ...
-    def get_run_once(self) -> bool:
-        "Boolean flag whether configuration should only run once"
-        ...
-    def set_run_once(self, run_once_flag: bool) -> None:
-        "Set run-once flag for this configuration"
-        ...
-    def get_timeout(self) -> float:
-        "Get time-out value for benchmark execution of this configuration, in seconds"
-        ...
-    def set_timeout(self, duration: SupportsFloat) -> None:
-        "Set time-out value for benchmark execution of this configuration, in seconds"
-        ...
-    def get_blocking_kernel_timeout(self) -> float:
-        "Get time-out value for execution of blocking kernel, in seconds"
-        ...
-    def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None:
-        "Set time-out value for execution of blocking kernel, in seconds"
-        ...
-    def collect_cupti_metrics(self) -> None:
-        "Request NVBench to record CUPTI metrics while running benchmark for this configuration"
-        ...
-    def is_cupti_required(self) -> bool:
-        "True if (some) CUPTI metrics are being collected"
-        ...
+    ) -> None: ...
+    def get_benchmark(self) -> Benchmark: ...
+    def get_throttle_threshold(self) -> float: ...
+    def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None: ...
+    def get_min_samples(self) -> int: ...
+    def set_min_samples(self, min_samples_count: SupportsInt) -> None: ...
+    def get_disable_blocking_kernel(self) -> bool: ...
+    def set_disable_blocking_kernel(self, flag: bool) -> None: ...
+    def get_run_once(self) -> bool: ...
+    def set_run_once(self, run_once_flag: bool) -> None: ...
+    def get_timeout(self) -> float: ...
+    def set_timeout(self, duration: SupportsFloat) -> None: ...
+    def get_blocking_kernel_timeout(self) -> float: ...
+    def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: ...
+    def collect_cupti_metrics(self) -> None: ...
+    def is_cupti_required(self) -> bool: ...
     def exec(
         self,
         fn: Callable[[Launch], None],
@@ -253,60 +104,16 @@ class State:
         *,
         batched: Optional[bool] = True,
         sync: Optional[bool] = False,
-    ):
-        """Execute callable running the benchmark.
-
-        The callable may be executed multiple times.
-
-        Parameters
-        ----------
-        fn: Callable
-            Python callable with signature fn(Launch) -> None that executes the benchmark.
-        batched: bool, optional
-            If `True`, no cache flushing is performed between callable invocations.
-            Default: `True`.
-        sync: bool, optional
-            True value indicates that callable performs device synchronization.
-            NVBench disables use of blocking kernel in this case.
-            Default: `False`.
-        """
-        ...
-    def get_short_description(self) -> str:
-        "Get short description for this configuration"
-        ...
+    ): ...
+    def get_short_description(self) -> str: ...
     def add_summary(
         self, column_name: str, value: Union[SupportsInt, SupportsFloat, str]
-    ) -> None:
-        "Add summary column with a value"
-        ...
-    def get_axis_values(self) -> dict[str, int | float | str]:
-        "Get dictionary with axis values for this configuration"
-        ...
-    def get_axis_values_as_string(self) -> str:
-        "Get string of space-separated name=value pairs for this configuration"
-        ...
-    def get_stopping_criterion(self) -> str:
-        "Get string name of stopping criterion used"
-        ...
-
-def register(fn: Callable[[State], None]) -> Benchmark:
-    """
-    Register given benchmarking function with NVBench.
-    """
-    ...
-
-def run_all_benchmarks(argv: Sequence[str]) -> None:
-    """
-    Run all benchmarks registered with NVBench.
-
-    Parameters
-    ----------
-    argv: List[str]
-        Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
-    """
-    ...
+    ) -> None: ...
+    def get_axis_values(self) -> dict[str, int | float | str]: ...
+    def get_axis_values_as_string(self) -> str: ...
+    def get_stopping_criterion(self) -> str: ...
 
-class NVBenchRuntimeError(RuntimeError):
-    """An exception raised if running benchmarks encounters an error"""
+def register(fn: Callable[[State], None]) -> Benchmark: ...
+def run_all_benchmarks(argv: Sequence[str]) -> None: ...
 
-    ...
+class NVBenchRuntimeError(RuntimeError): ...
diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp
index 2b09574d..e01a33ce 100644
--- a/python/src/py_nvbench.cpp
+++ b/python/src/py_nvbench.cpp
@@ -35,8 +35,8 @@
 
 namespace py = pybind11;
 
-// namespace
-//{
+namespace
+{
 
 struct PyObjectDeleter
 {
@@ -61,8 +61,6 @@ struct PyObjectDeleter
   }
 };
 
-namespace
-{
 struct benchmark_wrapper_t
 {
 
@@ -114,10 +112,10 @@ struct benchmark_wrapper_t
 
 private:
   // Important to use shared pointer here rather than py::object directly,
-  // since copy constructor must be const (benchmark::do_clone is const member method)
+  // since copy constructor must be const (consequence of benchmark::do_clone
+  // being const member method)
   std::shared_ptr<py::object> m_fn;
 };
-} // namespace
 
 // Use struct to ensure public inheritance
 struct nvbench_run_error : std::runtime_error
@@ -264,58 +262,85 @@ py::dict py_get_axis_values(const nvbench::state &state)
 // essentially a global variable, but allocated on the heap during module initialization
 std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};
 
-//} // end of anonymous namespace
-
-// ==========================================
-// PLEASE KEEP IN SYNC WITH __init__.pyi FILE
-// ==========================================
-// If you modify these bindings, please be sure to update the
-// corresponding type hints in ``../cuda/nvbench/__init__.pyi``
-
-PYBIND11_MODULE(_nvbench, m)
+// Definitions of Python API
+static void def_class_CudaStream(py::module_ m)
 {
-  // == STEP 1
-  // Set environment variable CUDA_MODULE_LOADING=EAGER
-
-  NVBENCH_DRIVER_API_CALL(cuInit(0));
-
-  // This line ensures that benchmark_manager has been created during module init
-  // It is reinitialized before running all benchmarks to set devices to use
-  nvbench::benchmark_manager::get().initialize();
-
-  // == STEP 2
   // Define CudaStream class
   //    ATTN: nvbench::cuda_stream is move-only class
   //    Methods:
   //       Constructors, based on device, or on existing stream
   //       nvbench::cuda_stream::get_stream
 
-  auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");
-
-  py_cuda_stream_cls.def("__cuda_stream__",
-                         [](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
-                           return std::make_pair(std::size_t{0},
-                                                 reinterpret_cast<std::size_t>(s.get_stream()));
-                         });
-
-  py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
+  static constexpr const char *class_CudaStream_doc = R"XXX(
+Represents CUDA stream
+
+    Note
+    ----
+        The class is not user-constructible.
+)XXX";
+
+  auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream", class_CudaStream_doc);
+
+  auto method__cuda_stream__impl =
+    [](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
+    return std::make_pair(std::size_t{0}, reinterpret_cast<std::size_t>(s.get_stream()));
+  };
+  static constexpr const char *method__cuda_stream__doc = R"XXX(
+        Special method implementing the CUDA stream protocol
+        from `cuda.core`. Returns a pair of integers:
+        (protocol_version, integral_value_of_cudaStream_t pointer)
+
+        Example
+        -------
+            import cuda.core.experimental as core
+            import cuda.bench as bench
+
+            def bench(state: bench.State):
+                dev = core.Device(state.get_device())
+                dev.set_current()
+                # converts CudaStream to core.Stream
+                # using __cuda_stream__ protocol
+                dev.create_stream(state.get_stream())
+)XXX";
+  py_cuda_stream_cls.def("__cuda_stream__", method__cuda_stream__impl, method__cuda_stream__doc);
+
+  auto method_addressof_impl = [](const nvbench::cuda_stream &s) -> std::size_t {
     return reinterpret_cast<std::size_t>(s.get_stream());
-  });
+  };
+  static constexpr const char *method_addressof_doc =
+    R"XXXX(Integral value of address of driver's CUDA stream struct)XXXX";
+  py_cuda_stream_cls.def("addressof", method_addressof_impl, method_addressof_doc);
+}
 
-  // == STEP 3
+void def_class_Launch(py::module_ m)
+{
   // Define Launch class
   //    ATTN: nvbench::launch is move-only class
   //    Methods:
   //        nvbench::launch::get_stream -> nvbench::cuda_stream
 
-  auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");
-
-  py_launch_cls.def(
-    "get_stream",
-    [](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
-    py::return_value_policy::reference);
+  static constexpr const char *class_Launch_doc = R"XXXX(
+Configuration object for function launch.
+
+    Note
+    ----
+        The class is not user-constructible.
+)XXXX";
+  auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch", class_Launch_doc);
+
+  auto method_get_stream_impl = [](nvbench::launch &launch) {
+    return std::ref(launch.get_stream());
+  };
+  static constexpr const char *method_get_stream_doc =
+    R"XXXX(Get CUDA stream of this configuration)XXXX";
+  py_launch_cls.def("get_stream",
+                    method_get_stream_impl,
+                    method_get_stream_doc,
+                    py::return_value_policy::reference);
+}
 
-  // == STEP 4
+static void def_class_Benchmark(py::module_ m)
+{
   // Define Benchmark class
   //    ATTN: nvbench::benchmark_base is move-only class
   //    Methods:
@@ -336,164 +361,268 @@ PYBIND11_MODULE(_nvbench, m)
   //        nvbench::benchmark_base::set_criterion_param_string
   //        nvbench::benchmark_base::set_min_samples
 
-  auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
-  py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name);
+  static constexpr const char *class_Benchmark_doc = R"XXXX(
+Represents NVBench benchmark.
 
-  py_benchmark_cls.def(
-    "add_int64_axis",
-    [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
-      self.add_int64_axis(std::move(name), std::move(data));
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("values"));
+    Note
+    ----
+        The class is not user-constructible.
 
-  py_benchmark_cls.def(
-    "add_int64_power_of_two_axis",
+        Use `~register` function to create Benchmark and register
+        it with NVBench.
+)XXXX";
+  auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark", class_Benchmark_doc);
+
+  // method Benchmark.get_name
+  auto method_get_name_impl                        = &nvbench::benchmark_base::get_name;
+  static constexpr const char *method_get_name_doc = R"XXXX(Get benchmark name)XXXX";
+  py_benchmark_cls.def("get_name", method_get_name_impl, method_get_name_doc);
+
+  // method Benchmark.add_int64_axis
+  auto method_add_int64_axis_impl =
     [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
-      self.add_int64_axis(std::move(name),
-                          std::move(data),
-                          nvbench::int64_axis_flags::power_of_two);
+      self.add_int64_axis(std::move(name), std::move(data));
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("values"));
-
-  py_benchmark_cls.def(
-    "add_float64_axis",
+    };
+  static constexpr const char *method_add_int64_axis_doc = R"XXXX(
+Add integral type parameter axis with given name and values to sweep over
+)XXXX";
+  py_benchmark_cls.def("add_int64_axis",
+                       method_add_int64_axis_impl,
+                       method_add_int64_axis_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("values"));
+
+  // method Benchmark.add_int64_power_of_two_axis
+  auto method_add_int64_power_of_two_axis_impl = [](nvbench::benchmark_base &self,
+                                                    std::string name,
+                                                    std::vector<nvbench::int64_t> data) {
+    self.add_int64_axis(std::move(name), std::move(data), nvbench::int64_axis_flags::power_of_two);
+    return std::ref(self);
+  };
+  static constexpr const char *method_add_int64_power_of_two_axis_doc = R"XXXX(
+Add integral type parameter axis with given name, sweeping over powers of two whose exponents are the given values
+)XXXX";
+  py_benchmark_cls.def("add_int64_power_of_two_axis",
+                       method_add_int64_power_of_two_axis_impl,
+                       method_add_int64_power_of_two_axis_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("values"));
+
+  // method Benchmark.add_float64_axis
+  auto method_add_float64_axis_impl =
     [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::float64_t> data) {
       self.add_float64_axis(std::move(name), std::move(data));
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("values"));
-
-  py_benchmark_cls.def(
-    "add_string_axis",
+    };
+  static constexpr const char *method_add_float64_axis_doc = R"XXXX(
+Add floating-point type parameter axis with given name and values to sweep over
+)XXXX";
+  py_benchmark_cls.def("add_float64_axis",
+                       method_add_float64_axis_impl,
+                       method_add_float64_axis_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("values"));
+
+  // method Benchmark.add_string_axis
+  auto method_add_string_axis_impl =
     [](nvbench::benchmark_base &self, std::string name, std::vector<std::string> data) {
       self.add_string_axis(std::move(name), std::move(data));
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("values"));
-
-  py_benchmark_cls.def(
-    "set_name",
-    [](nvbench::benchmark_base &self, std::string name) {
-      self.set_name(std::move(name));
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"));
-
-  py_benchmark_cls.def(
-    "set_is_cpu_only",
-    [](nvbench::benchmark_base &self, bool is_cpu_only) {
-      self.set_is_cpu_only(is_cpu_only);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("is_cpu_only"));
-
+    };
+  static constexpr const char *method_add_string_axis_doc = R"XXXX(
+Add string type parameter axis with given name and values to sweep over
+)XXXX";
+  py_benchmark_cls.def("add_string_axis",
+                       method_add_string_axis_impl,
+                       method_add_string_axis_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("values"));
+
+  // method Benchmark.set_name
+  auto method_set_name_impl = [](nvbench::benchmark_base &self, std::string name) {
+    self.set_name(std::move(name));
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_name_doc = R"XXXX(Set benchmark name)XXXX";
+  py_benchmark_cls.def("set_name",
+                       method_set_name_impl,
+                       method_set_name_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"));
+
+  // method Benchmark.set_is_cpu_only
+  auto method_set_is_cpu_only_impl = [](nvbench::benchmark_base &self, bool is_cpu_only) {
+    self.set_is_cpu_only(is_cpu_only);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_is_cpu_only_doc =
+    R"XXXX(Set whether this benchmark only executes on CPU)XXXX";
+  py_benchmark_cls.def("set_is_cpu_only",
+                       method_set_is_cpu_only_impl,
+                       method_set_is_cpu_only_doc,
+                       py::return_value_policy::reference,
+                       py::arg("is_cpu_only"));
+
+  // method Benchmark.set_run_once
+  auto method_set_run_once_impl = [](nvbench::benchmark_base &self, bool run_once) {
+    self.set_run_once(run_once);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_run_once_doc = R"XXXX(
+Set whether all benchmark configurations are executed only once
+)XXXX";
   // TODO: should this be exposed?
-  py_benchmark_cls.def(
-    "set_run_once",
-    [](nvbench::benchmark_base &self, bool run_once) {
-      self.set_run_once(run_once);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("run_once"));
-
-  py_benchmark_cls.def(
-    "set_skip_time",
-    [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) {
-      self.set_skip_time(skip_duration_seconds);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("duration_seconds"));
-
-  py_benchmark_cls.def(
-    "set_timeout",
-    [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) {
-      self.set_timeout(duration_seconds);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("duration_seconds"));
-
-  py_benchmark_cls.def(
-    "set_throttle_threshold",
-    [](nvbench::benchmark_base &self, nvbench::float32_t threshold) {
-      self.set_throttle_threshold(threshold);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("threshold"));
-
-  py_benchmark_cls.def(
-    "set_throttle_recovery_delay",
-    [](nvbench::benchmark_base &self, nvbench::float32_t delay) {
-      self.set_throttle_recovery_delay(delay);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("delay_seconds"));
-
-  py_benchmark_cls.def(
-    "set_stopping_criterion",
-    [](nvbench::benchmark_base &self, std::string criterion) {
-      self.set_stopping_criterion(std::move(criterion));
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("criterion"));
-
-  py_benchmark_cls.def(
-    "set_criterion_param_int64",
+  py_benchmark_cls.def("set_run_once",
+                       method_set_run_once_impl,
+                       method_set_run_once_doc,
+                       py::return_value_policy::reference,
+                       py::arg("run_once"));
+
+  // method Benchmark.set_skip_time
+  auto method_set_skip_time_impl = [](nvbench::benchmark_base &self,
+                                      nvbench::float64_t skip_duration_seconds) {
+    self.set_skip_time(skip_duration_seconds);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_skip_time_doc = R"XXXX(
+Set value, in seconds, such that runs with duration shorter than this are skipped
+)XXXX";
+  py_benchmark_cls.def("set_skip_time",
+                       method_set_skip_time_impl,
+                       method_set_skip_time_doc,
+                       py::return_value_policy::reference,
+                       py::arg("duration_seconds"));
+
+  // method Benchmark.set_timeout
+  auto method_set_timeout_impl = [](nvbench::benchmark_base &self,
+                                    nvbench::float64_t duration_seconds) {
+    self.set_timeout(duration_seconds);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_timeout_doc = R"XXXX(
+Set benchmark run duration timeout value, in seconds
+)XXXX";
+  py_benchmark_cls.def("set_timeout",
+                       method_set_timeout_impl,
+                       method_set_timeout_doc,
+                       py::return_value_policy::reference,
+                       py::arg("duration_seconds"));
+
+  // method Benchmark.set_throttle_threshold
+  auto method_set_throttle_threshold_impl = [](nvbench::benchmark_base &self,
+                                               nvbench::float32_t threshold) {
+    self.set_throttle_threshold(threshold);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_throttle_threshold_doc = R"XXXX(
+Set throttle threshold, as a fraction of maximal GPU frequency, in percents
+)XXXX";
+  py_benchmark_cls.def("set_throttle_threshold",
+                       method_set_throttle_threshold_impl,
+                       method_set_throttle_threshold_doc,
+                       py::return_value_policy::reference,
+                       py::arg("threshold"));
+
+  // method Benchmark.set_throttle_recovery_delay
+  auto method_set_throttle_recovery_delay_impl = [](nvbench::benchmark_base &self,
+                                                    nvbench::float32_t delay) {
+    self.set_throttle_recovery_delay(delay);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_throttle_recovery_delay_doc = R"XXXX(
+Set throttle recovery delay, in seconds
+)XXXX";
+  py_benchmark_cls.def("set_throttle_recovery_delay",
+                       method_set_throttle_recovery_delay_impl,
+                       method_set_throttle_recovery_delay_doc,
+                       py::return_value_policy::reference,
+                       py::arg("delay_seconds"));
+
+  // method Benchmark.set_stopping_criterion
+  auto method_set_stopping_criterion_impl = [](nvbench::benchmark_base &self,
+                                               std::string criterion) {
+    self.set_stopping_criterion(std::move(criterion));
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_stopping_criterion_doc = R"XXXX(
+Set stopping criterion to be used
+)XXXX";
+  py_benchmark_cls.def("set_stopping_criterion",
+                       method_set_stopping_criterion_impl,
+                       method_set_stopping_criterion_doc,
+                       py::return_value_policy::reference,
+                       py::arg("criterion"));
+
+  // method Benchmark.set_criterion_param_int64
+  auto method_set_criterion_param_int64_impl =
     [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) {
       self.set_criterion_param_int64(std::move(name), value);
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("value"));
-
-  py_benchmark_cls.def(
-    "set_criterion_param_float64",
+    };
+  static constexpr const char *method_set_criterion_param_int64_doc = R"XXXX(
+Set stopping criterion integer parameter value
+)XXXX";
+  py_benchmark_cls.def("set_criterion_param_int64",
+                       method_set_criterion_param_int64_impl,
+                       method_set_criterion_param_int64_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("value"));
+
+  // method Benchmark.set_criterion_param_float64
+  auto method_set_criterion_param_float64_impl =
     [](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) {
       self.set_criterion_param_float64(std::move(name), value);
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("value"));
-
-  py_benchmark_cls.def(
-    "set_criterion_param_string",
+    };
+  static constexpr const char *method_set_criterion_param_float64_doc = R"XXXX(
+Set stopping criterion floating point parameter value
+)XXXX";
+  py_benchmark_cls.def("set_criterion_param_float64",
+                       method_set_criterion_param_float64_impl,
+                       method_set_criterion_param_float64_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("value"));
+
+  // method Benchmark.set_criterion_param_string
+  auto method_set_criterion_param_string_impl =
     [](nvbench::benchmark_base &self, std::string name, std::string value) {
       self.set_criterion_param_string(std::move(name), std::move(value));
       return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("name"),
-    py::arg("value"));
-
-  py_benchmark_cls.def(
-    "set_min_samples",
-    [](nvbench::benchmark_base &self, nvbench::int64_t count) {
-      self.set_min_samples(count);
-      return std::ref(self);
-    },
-    py::return_value_policy::reference,
-    py::arg("min_samples_count"));
+    };
+  static constexpr const char *method_set_criterion_param_string_doc = R"XXXX(
+Set stopping criterion string parameter value
+)XXXX";
+  py_benchmark_cls.def("set_criterion_param_string",
+                       method_set_criterion_param_string_impl,
+                       method_set_criterion_param_string_doc,
+                       py::return_value_policy::reference,
+                       py::arg("name"),
+                       py::arg("value"));
+
+  // method Benchmark.set_min_samples
+  auto method_set_min_samples_impl = [](nvbench::benchmark_base &self, nvbench::int64_t count) {
+    self.set_min_samples(count);
+    return std::ref(self);
+  };
+  static constexpr const char *method_set_min_samples_doc = R"XXXX(
+Set minimal samples count before stopping criterion applies
+)XXXX";
+  py_benchmark_cls.def("set_min_samples",
+                       method_set_min_samples_impl,
+                       method_set_min_samples_doc,
+                       py::return_value_policy::reference,
+                       py::arg("min_samples_count"));
+}
 
-  // == STEP 5
+void def_class_State(py::module_ m)
+{
   // Define PyState class
   //    ATTN: nvbench::state is move-only class
   //    Methods:
@@ -561,244 +690,520 @@ PYBIND11_MODULE(_nvbench, m)
   // NOTE:
   //    State wraps std::reference_wrapper<nvbench::state>
 
-  using state_ref_t = std::reference_wrapper<nvbench::state>;
-  auto pystate_cls  = py::class_<nvbench::state>(m, "State");
+  using state_ref_t                            = std::reference_wrapper<nvbench::state>;
+  static constexpr const char *class_State_doc = R"XXXX(
+Represent benchmark configuration state.
 
-  pystate_cls.def("has_device", [](const nvbench::state &state) -> bool {
-    return static_cast<bool>(state.get_device());
-  });
+    Note
+    ----
+        The class is not user-constructible.
+)XXXX";
+  auto pystate_cls = py::class_<nvbench::state>(m, "State", class_State_doc);
 
-  pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool {
+  // method State.has_device
+  auto method_has_device_impl = [](const nvbench::state &state) -> bool {
+    return static_cast<bool>(state.get_device());
+  };
+  static constexpr const char *method_has_device_doc = R"XXXX(
+Returns True if configuration has a device
+)XXXX";
+  pystate_cls.def("has_device", method_has_device_impl, method_has_device_doc);
+
+  // method State.has_printers
+  auto method_has_printers_impl = [](const nvbench::state &state) -> bool {
     return state.get_benchmark().get_printer().has_value();
-  });
-
-  pystate_cls.def("get_device", [](const nvbench::state &state) {
+  };
+  static constexpr const char *method_has_printers_doc = R"XXXX(
+Returns True if configuration has a printer
+)XXXX";
+  pystate_cls.def("has_printers", method_has_printers_impl, method_has_printers_doc);
+
+  // method State.get_device
+  auto method_get_device_impl = [](const nvbench::state &state) {
     auto dev = state.get_device();
     if (dev.has_value())
     {
       return py::cast(dev.value().get_id());
     }
     return py::object(py::none());
-  });
-
-  pystate_cls.def(
-    "get_stream",
-    [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); },
-    py::return_value_policy::reference);
-
-  pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name"));
+  };
+  static constexpr const char *method_get_device_doc = R"XXXX(
+Get device_id of the device from this configuration
+)XXXX";
+  pystate_cls.def("get_device", method_get_device_impl, method_get_device_doc);
+
+  // method State.get_stream
+  auto method_get_stream_impl = [](nvbench::state &state) {
+    return std::ref(state.get_cuda_stream());
+  };
+  static constexpr const char *method_get_stream_doc = R"XXXX(
+Get `~CudaStream` object from this configuration
+)XXXX";
+  pystate_cls.def("get_stream",
+                  method_get_stream_impl,
+                  method_get_stream_doc,
+                  py::return_value_policy::reference);
+
+  // method State.get_int64
+  auto method_get_int64_impl                        = &nvbench::state::get_int64;
+  static constexpr const char *method_get_int64_doc = R"XXXX(
+Get value for given Int64 axis from this configuration
+)XXXX";
+  pystate_cls.def("get_int64", method_get_int64_impl, method_get_int64_doc, py::arg("name"));
+
+  // method State.get_int64_or_default
+  auto method_get_int64_or_default_impl = &nvbench::state::get_int64_or_default;
+  static constexpr const char *method_get_int64_or_default_doc = method_get_int64_doc;
   pystate_cls.def("get_int64_or_default",
-                  &nvbench::state::get_int64_or_default,
+                  method_get_int64_or_default_impl,
+                  method_get_int64_or_default_doc,
                   py::arg("name"),
                   py::pos_only{},
                   py::arg("default_value"));
 
-  pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name"));
+  // method State.get_float64
+  auto method_get_float64_impl                        = &nvbench::state::get_float64;
+  static constexpr const char *method_get_float64_doc = R"XXXX(
+Get value for given Float64 axis from this configuration
+)XXXX";
+  pystate_cls.def("get_float64", method_get_float64_impl, method_get_float64_doc, py::arg("name"));
+
+  // method State.get_float64_or_default
+  static constexpr const char *method_get_float64_or_default_doc = method_get_float64_doc;
   pystate_cls.def("get_float64_or_default",
                   &nvbench::state::get_float64_or_default,
+                  method_get_float64_or_default_doc,
                   py::arg("name"),
                   py::pos_only{},
                   py::arg("default_value"));
 
-  pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name"));
+  // method State.get_string
+  static constexpr const char *method_get_string_doc = R"XXXX(
+Get value for given String axis from this configuration
+)XXXX";
+  pystate_cls.def("get_string", &nvbench::state::get_string, method_get_string_doc, py::arg("name"));
+
+  // method State.get_string_or_default
+  static constexpr const char *method_get_string_or_default_doc = method_get_string_doc;
   pystate_cls.def("get_string_or_default",
                   &nvbench::state::get_string_or_default,
+                  method_get_string_or_default_doc,
                   py::arg("name"),
                   py::pos_only{},
                   py::arg("default_value"));
 
+  // method State.add_element_count
+  static constexpr const char *method_add_element_count_doc = R"XXXX(
+Add element count
+)XXXX";
   pystate_cls.def("add_element_count",
                   &nvbench::state::add_element_count,
+                  method_add_element_count_doc,
                   py::arg("count"),
                   py::arg("column_name") = py::str(""));
 
-  pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count"));
-  pystate_cls.def("get_element_count", &nvbench::state::get_element_count);
-
+  // method State.set_element_count
+  static constexpr const char *method_set_element_count_doc = R"XXXX(
+Set element count
+)XXXX";
+  pystate_cls.def("set_element_count",
+                  &nvbench::state::set_element_count,
+                  method_set_element_count_doc,
+                  py::arg("count"));
+
+  // method State.get_element_count
+  static constexpr const char *method_get_element_count_doc = R"XXXX(
+Get element count
+)XXXX";
+  pystate_cls.def("get_element_count",
+                  &nvbench::state::get_element_count,
+                  method_get_element_count_doc);
+
+  // method State.skip
+  static constexpr const char *method_skip_doc = "Skip this configuration";
-  pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason"));
+  pystate_cls.def("skip", &nvbench::state::skip, method_skip_doc, py::arg("reason"));
-  pystate_cls.def("is_skipped", &nvbench::state::is_skipped);
-  pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason);
 
-  pystate_cls.def(
-    "add_global_memory_reads",
+  // method State.is_skipped
+  static constexpr const char *method_is_skipped_doc = R"XXXX(
+Returns True if this configuration is being skipped
+)XXXX";
+  pystate_cls.def("is_skipped", &nvbench::state::is_skipped, method_is_skipped_doc);
+
+  // method State.get_skip_reason
+  static constexpr const char *method_get_skip_reason_doc = R"XXXX(
+Get reason provided for skipping this configuration
+)XXXX";
+  pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason, method_get_skip_reason_doc);
+
+  // method State.add_global_memory_reads
+  auto method_add_global_memory_reads_impl =
     [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
-      state.add_global_memory_reads(nbytes, column_name);
-    },
-    "Add size, in bytes, of global memory reads",
-    py::arg("nbytes"),
-    py::pos_only{},
-    py::arg("column_name") = py::str(""));
-
-  pystate_cls.def(
-    "add_global_memory_writes",
+    state.add_global_memory_reads(nbytes, column_name);
+  };
+  static constexpr const char *method_add_global_memory_reads_doc = R"XXXX(
+Inform NVBench that given amount of bytes is being read by the benchmark from global memory
+)XXXX";
+  pystate_cls.def("add_global_memory_reads",
+                  method_add_global_memory_reads_impl,
+                  method_add_global_memory_reads_doc,
+                  py::arg("nbytes"),
+                  py::pos_only{},
+                  py::arg("column_name") = py::str(""));
+
+  // method State.add_global_memory_writes
+  auto method_add_global_memory_writes_impl =
     [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
-      state.add_global_memory_writes(nbytes, column_name);
-    },
-    "Add size, in bytes, of global memory writes",
-    py::arg("nbytes"),
-    py::pos_only{},
-    py::arg("column_name") = py::str(""));
-
-  pystate_cls.def(
-    "get_benchmark",
-    [](const nvbench::state &state) { return std::ref(state.get_benchmark()); },
-    py::return_value_policy::reference);
-
-  pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold);
+    state.add_global_memory_writes(nbytes, column_name);
+  };
+  static constexpr const char *method_add_global_memory_writes_doc = R"XXXX(
+Inform NVBench that given amount of bytes is being written by the benchmark into global memory
+)XXXX";
+  pystate_cls.def("add_global_memory_writes",
+                  method_add_global_memory_writes_impl,
+                  method_add_global_memory_writes_doc,
+                  py::arg("nbytes"),
+                  py::pos_only{},
+                  py::arg("column_name") = py::str(""));
+
+  // method State.get_benchmark
+  auto method_get_benchmark_impl = [](const nvbench::state &state) {
+    return std::ref(state.get_benchmark());
+  };
+  static constexpr const char *method_get_benchmark_doc = R"XXXX(
+Get Benchmark this configuration is a part of
+)XXXX";
+  pystate_cls.def("get_benchmark",
+                  method_get_benchmark_impl,
+                  method_get_benchmark_doc,
+                  py::return_value_policy::reference);
+
+  // method State.get_throttle_threshold
+  static constexpr const char *method_get_throttle_threshold_doc = R"XXXX(
+Get throttle threshold value, as fraction of maximal frequency.
+
+Note
+----
+    A valid threshold value is between 0 and 1.
+)XXXX";
+  pystate_cls.def("get_throttle_threshold",
+                  &nvbench::state::get_throttle_threshold,
+                  method_get_throttle_threshold_doc);
+
+  // method State.set_throttle_threshold
+  static constexpr const char *method_set_throttle_threshold_doc = R"XXXX(
+Set throttle threshold fraction to the specified value, expected to be between 0 and 1
+)XXXX";
   pystate_cls.def("set_throttle_threshold",
                   &nvbench::state::set_throttle_threshold,
+                  method_set_throttle_threshold_doc,
                   py::arg("throttle_fraction"));
 
-  pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples);
+  // method State.get_min_samples
+  static constexpr const char *method_get_min_samples_doc = R"XXXX(
+Get the number of benchmark timings NVBench performs before stopping criterion begins being used
+)XXXX";
+  pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples, method_get_min_samples_doc);
+
+  // method State.set_min_samples
+  static constexpr const char *method_set_min_samples_doc = R"XXXX(
+Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used
+)XXXX";
   pystate_cls.def("set_min_samples",
                   &nvbench::state::set_min_samples,
+                  method_set_min_samples_doc,
                   py::arg("min_samples_count"));
 
-  pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel);
-
+  // method State.get_disable_blocking_kernel
+  static constexpr const char *method_get_disable_blocking_kernel_doc = R"XXXX(
+Return True if use of blocking kernel by NVBench is disabled, False otherwise
+)XXXX";
+  pystate_cls.def("get_disable_blocking_kernel",
+                  &nvbench::state::get_disable_blocking_kernel,
+                  method_get_disable_blocking_kernel_doc);
+
+  // method State.set_disable_blocking_kernel
+  static constexpr const char *method_set_disable_blocking_kernel_doc = R"XXXX(
+Use argument True to disable use of blocking kernel by NVBench
+)XXXX";
   pystate_cls.def("set_disable_blocking_kernel",
                   &nvbench::state::set_disable_blocking_kernel,
+                  method_set_disable_blocking_kernel_doc,
                   py::arg("disable_blocking_kernel"));
 
-  pystate_cls.def("get_run_once", &nvbench::state::get_run_once);
-
-  pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once"));
-
-  pystate_cls.def("get_timeout", &nvbench::state::get_timeout);
-
-  pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration"));
-
-  pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout);
-
+  // method State.get_run_once
+  static constexpr const char *method_get_run_once_doc =
+    R"XXXX(Boolean flag indicating whether configuration should only run once)XXXX";
+  pystate_cls.def("get_run_once", &nvbench::state::get_run_once, method_get_run_once_doc);
+
+  // method State.set_run_once
+  static constexpr const char *method_set_run_once_doc =
+    R"XXXX(Set run-once flag for this configuration)XXXX";
+  pystate_cls.def("set_run_once",
+                  &nvbench::state::set_run_once,
+                  method_set_run_once_doc,
+                  py::arg("run_once"));
+
+  // method State.get_timeout
+  static constexpr const char *method_get_timeout_doc =
+    R"XXXX(Get time-out value for benchmark execution of this configuration, in seconds)XXXX";
+  pystate_cls.def("get_timeout", &nvbench::state::get_timeout, method_get_timeout_doc);
+
+  // method State.set_timeout
+  static constexpr const char *method_set_timeout_doc =
+    R"XXXX(Set time-out value for benchmark execution of this configuration, in seconds)XXXX";
+  pystate_cls.def("set_timeout",
+                  &nvbench::state::set_timeout,
+                  method_set_timeout_doc,
+                  py::arg("duration_seconds"));
+
+  // method State.get_blocking_kernel_timeout
+  static constexpr const char *method_get_blocking_kernel_timeout_doc =
+    R"XXXX(Get time-out value for execution of blocking kernel, in seconds)XXXX";
+  pystate_cls.def("get_blocking_kernel_timeout",
+                  &nvbench::state::get_blocking_kernel_timeout,
+                  method_get_blocking_kernel_timeout_doc);
+
+  // method State.set_blocking_kernel_timeout
+  static constexpr const char *method_set_blocking_kernel_timeout_doc =
+    R"XXXX(Set time-out value for execution of blocking kernel, in seconds)XXXX";
   pystate_cls.def("set_blocking_kernel_timeout",
                   &nvbench::state::set_blocking_kernel_timeout,
-                  py::arg("duration"));
-
-  pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics);
+                  method_set_blocking_kernel_timeout_doc,
+                  py::arg("duration_seconds"));
+
+  // method State.collect_cupti_metrics
+  static constexpr const char *method_collect_cupti_metrics_doc =
+    R"XXXX(Request NVBench to record CUPTI metrics while running benchmark for this configuration)XXXX";
+  pystate_cls.def("collect_cupti_metrics",
+                  &nvbench::state::collect_cupti_metrics,
+                  method_collect_cupti_metrics_doc);
+
+  // method State.is_cupti_required
+  static constexpr const char *method_is_cupti_required_doc =
+    R"XXXX(True if (some) CUPTI metrics are being collected)XXXX";
+  pystate_cls.def("is_cupti_required",
+                  &nvbench::state::is_cupti_required,
+                  method_is_cupti_required_doc);
+
+  // method State.exec
+  auto method_exec_impl =
+    [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) -> void {
+    if (!PyCallable_Check(py_launcher_fn.ptr()))
+    {
+      throw py::type_error("Argument of exec method must be a callable object");
+    }
 
-  pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required);
+    // wrapper to invoke Python callable
+    auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void {
+      // cast C++ object to python object
+      auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference);
+      // call Python callable
+      py_launcher_fn(launch_pyarg);
+    };
 
-  pystate_cls.def(
-    "exec",
-    [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) {
-      if (!PyCallable_Check(py_launcher_fn.ptr()))
+    if (sync)
+    {
+      if (batched)
       {
-        throw py::type_error("Argument of exec method must be a callable object");
+        constexpr auto tag = nvbench::exec_tag::sync;
+        state.exec(tag, cpp_launcher_fn);
       }
-
-      // wrapper to invoke Python callable
-      auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void {
-        // cast C++ object to python object
-        auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference);
-        // call Python callable
-        py_launcher_fn(launch_pyarg);
-      };
-
-      if (sync)
+      else
       {
-        if (batched)
-        {
-          constexpr auto tag = nvbench::exec_tag::sync;
-          state.exec(tag, cpp_launcher_fn);
-        }
-        else
-        {
-          constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch;
-          state.exec(tag, cpp_launcher_fn);
-        }
+        constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch;
+        state.exec(tag, cpp_launcher_fn);
+      }
+    }
+    else
+    {
+      if (batched)
+      {
+        constexpr auto tag = nvbench::exec_tag::none;
+        state.exec(tag, cpp_launcher_fn);
       }
       else
       {
-        if (batched)
-        {
-          constexpr auto tag = nvbench::exec_tag::none;
-          state.exec(tag, cpp_launcher_fn);
-        }
-        else
-        {
-          constexpr auto tag = nvbench::exec_tag::no_batch;
-          state.exec(tag, cpp_launcher_fn);
-        }
+        constexpr auto tag = nvbench::exec_tag::no_batch;
+        state.exec(tag, cpp_launcher_fn);
       }
-    },
-    "Executor for given launcher callable fn(state : Launch)",
-    py::arg("launcher_fn"),
-    py::pos_only{},
-    py::arg("batched") = true,
-    py::arg("sync")    = false);
+    }
+  };
+  static constexpr const char *method_exec_doc = R"XXXX(
+Execute callable running the benchmark.
+
+    The callable may be executed multiple times. The callable
+    will be passed `~Launch` object argument.
 
+    Parameters
+    ----------
+        launcher_fn: Callable
+            Python callable with signature fn(Launch) -> None that executes the benchmark.
+        batched: bool, optional
+            If `True`, no cache flushing is performed between callable invocations.
+            Default: `True`.
+        sync: bool, optional
+            True value indicates that callable performs device synchronization.
+            NVBench disables use of blocking kernel in this case.
+            Default: `False`.
+
+)XXXX";
+  pystate_cls.def("exec",
+                  method_exec_impl,
+                  method_exec_doc,
+                  py::arg("launcher_fn"),
+                  py::pos_only{},
+                  py::arg("batched") = true,
+                  py::arg("sync")    = false);
 
+  // method State.get_short_description
+  static constexpr const char *method_get_short_description_doc = R"XXXX(
+Get short description for this configuration
+)XXXX";
   pystate_cls.def("get_short_description",
-                  [](const nvbench::state &state) { return state.get_short_description(); });
+                  &nvbench::state::get_short_description,
+                  method_get_short_description_doc);
 
-  pystate_cls.def(
-    "add_summary",
+  // method State.add_summary
+  auto method_add_summary_string_value_impl =
     [](nvbench::state &state, std::string column_name, std::string value) {
       auto &summ = state.add_summary("nv/python/" + column_name);
       summ.set_string("description", "User tag: " + column_name);
       summ.set_string("name", std::move(column_name));
       summ.set_string("value", std::move(value));
-    },
-    py::arg("name"),
-    py::arg("value"));
+    };
+  static constexpr const char *method_add_summary_doc = R"XXXX(
+Add summary column with given name and value
+)XXXX";
+  pystate_cls.def("add_summary",
+                  method_add_summary_string_value_impl,
+                  method_add_summary_doc,
+                  py::arg("name"),
+                  py::arg("value"));
 
-  pystate_cls.def(
-    "add_summary",
-    [](nvbench::state &state, std::string column_name, std::int64_t value) {
+  auto method_add_summary_int64_value_impl =
+    [](nvbench::state &state, std::string column_name, nvbench::int64_t value) {
       auto &summ = state.add_summary("nv/python/" + column_name);
       summ.set_string("description", "User tag: " + column_name);
       summ.set_string("name", std::move(column_name));
       summ.set_int64("value", value);
-    },
-    py::arg("name"),
-    py::arg("value"));
+    };
+  pystate_cls.def("add_summary",
+                  method_add_summary_int64_value_impl,
+                  method_add_summary_doc,
+                  py::arg("name"),
+                  py::arg("value"));
 
-  pystate_cls.def(
-    "add_summary",
-    [](nvbench::state &state, std::string column_name, double value) {
+  auto method_add_summary_float64_value_impl =
+    [](nvbench::state &state, std::string column_name, nvbench::float64_t value) {
       auto &summ = state.add_summary("nv/python/" + column_name);
       summ.set_string("description", "User tag: " + column_name);
       summ.set_string("name", std::move(column_name));
       summ.set_float64("value", value);
-    },
-    py::arg("name"),
-    py::arg("value"));
+    };
+  pystate_cls.def("add_summary",
+                  method_add_summary_float64_value_impl,
+                  method_add_summary_doc,
+                  py::arg("name"),
+                  py::arg("value"));
 
+  // method State.get_axis_values_as_string
+  static constexpr const char *method_get_axis_values_as_string_doc = R"XXXX(
+Get string of space-separated name=value pairs for this configuration
+)XXXX";
   pystate_cls.def("get_axis_values_as_string",
-                  [](const nvbench::state &state) { return state.get_axis_values_as_string(); });
-  pystate_cls.def("get_axis_values", &py_get_axis_values);
-  pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion);
+                  &nvbench::state::get_axis_values_as_string,
+                  method_get_axis_values_as_string_doc);
+
+  // method State.get_axis_values
+  static constexpr const char *method_get_axis_values_doc = R"XXXX(
+Get dictionary with axis values for this configuration
+)XXXX";
+  pystate_cls.def("get_axis_values", &py_get_axis_values, method_get_axis_values_doc);
+
+  // method State.get_stopping_criterion
+  static constexpr const char *method_get_stopping_criterion_doc = R"XXXX(
+Get string name of the stopping criterion used
+)XXXX";
+  pystate_cls.def("get_stopping_criterion",
+                  &nvbench::state::get_stopping_criterion,
+                  method_get_stopping_criterion_doc);
+}
+
+} // namespace
+
+// ==========================================
+// PLEASE KEEP IN SYNC WITH __init__.pyi FILE
+// ==========================================
+// If you modify these bindings, please be sure to update the
+// corresponding type hints in ``../cuda/bench/__init__.pyi``
+
+PYBIND11_MODULE(_nvbench, m)
+{
+  NVBENCH_DRIVER_API_CALL(cuInit(0));
+
+  // This line ensures that benchmark_manager has been created during module init
+  // It is reinitialized before running all benchmarks to set devices to use
+  nvbench::benchmark_manager::get().initialize();
+
+  def_class_CudaStream(m);
+
+  def_class_Launch(m);
+
+  def_class_Benchmark(m);
+
+  def_class_State(m);
 
   // Use handle to take a memory leak here, since this object's destructor may be called after
   // interpreter has shut down
-  benchmark_exc =
-    py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError).release();
-  // == STEP 6
-  //    ATTN: nvbench::benchmark_manager is a singleton
+  static constexpr const char *exception_nvbench_runtime_error_doc = R"XXXX(
+An exception raised if running benchmarks encounters an error
+)XXXX";
+  py::object benchmark_exc_ =
+    py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError);
+  benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc;
+
+  benchmark_exc = benchmark_exc_.release();
 
+  // ATTN: nvbench::benchmark_manager is a singleton, it is exposed through
+  // GlobalBenchmarkRegistry class
   global_registry =
     std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
                                                            py::nodelete{});
 
-  m.def(
-    "register",
-    [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
-    "Register benchmark function of type Callable[[nvbench.State], None]",
-    py::return_value_policy::reference,
-    py::arg("benchmark_fn"));
-
-  m.def(
-    "run_all_benchmarks",
-    [&](py::object argv) -> void {
-      if (!py::isinstance<py::list>(argv))
-      {
-        throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
-      }
-      std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
-      global_registry->run(args);
-    },
-    "Run all registered benchmarks",
-    py::arg("argv") = py::list());
-
+  // function register
+  auto func_register_impl = [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); };
+  static constexpr const char *func_register_doc = R"XXXX(
+Register benchmark function of type Callable[[cuda.bench.State], None]
+)XXXX";
+  m.def("register",
+        func_register_impl,
+        func_register_doc,
+        py::return_value_policy::reference,
+        py::arg("benchmark_fn"));
+
+  // function run_all_benchmarks
+  auto func_run_all_benchmarks_impl = [&](py::object argv) -> void {
+    if (!py::isinstance<py::list>(argv))
+    {
+      throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
+    }
+    std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
+    global_registry->run(args);
+  };
+  static constexpr const char *func_run_all_benchmarks_doc = R"XXXX(
+    Run all benchmarks registered with NVBench.
+
+    Parameters
+    ----------
+    argv: List[str]
+        Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
+)XXXX";
+  m.def("run_all_benchmarks",
+        func_run_all_benchmarks_impl,
+        func_run_all_benchmarks_doc,
+        py::arg("argv") = py::list());
+
+  // Testing utilities
   m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
   m.def("test_py_exception", []() {
     py::set_error(benchmark_exc, "Test");
diff --git a/python/test/test_nvbench.py b/python/test/test_nvbench.py
index 5604a3f2..7d927e8f 100644
--- a/python/test/test_nvbench.py
+++ b/python/test/test_nvbench.py
@@ -37,3 +37,54 @@ def test_cpu_only():
     b.set_is_cpu_only(True)
 
     bench.run_all_benchmarks(["-q", "--profile"])
+
+
+def docstring_check(doc_str: str) -> None:  # passes only for a non-blank str
+    assert isinstance(doc_str, str), "docstring must be a str"
+    assert doc_str.strip(), "docstring must not be empty or whitespace-only"
+
+
+def obj_has_docstring_check(o: object) -> None:
+    docstring_check(o.__doc__)
+
+
+def test_module_doc():
+    obj_has_docstring_check(bench)
+
+
+def test_register_doc():
+    obj_has_docstring_check(bench.register)
+
+
+def test_run_all_benchmarks_doc():
+    obj_has_docstring_check(bench.run_all_benchmarks)
+
+
+def test_State_doc():
+    cl = bench.State
+    obj_has_docstring_check(cl)
+    obj_has_docstring_check(cl.exec)
+    obj_has_docstring_check(cl.get_int64)
+    obj_has_docstring_check(cl.get_float64)
+    obj_has_docstring_check(cl.get_string)
+    obj_has_docstring_check(cl.skip)
+
+
+def test_Launch_doc():
+    cl = bench.Launch
+    obj_has_docstring_check(cl)
+    obj_has_docstring_check(cl.get_stream)
+
+
+def test_CudaStream_doc():
+    cl = bench.CudaStream
+    obj_has_docstring_check(cl)
+
+
+def test_Benchmark_doc():
+    cl = bench.Benchmark
+    obj_has_docstring_check(cl)
+    obj_has_docstring_check(cl.add_int64_axis)
+    obj_has_docstring_check(cl.add_int64_power_of_two_axis)
+    obj_has_docstring_check(cl.add_float64_axis)
+    obj_has_docstring_check(cl.add_string_axis)

From 6a8bac520e069e804cda1a6122d23044819e51ea Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 9 Dec 2025 14:02:42 -0600
Subject: [PATCH 2/2] Replace use of py::handle to store benchmark exception type

Use py::gil_safe_call_once_and_store facility pybind11 provides.
---
 python/src/py_nvbench.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp
index e01a33ce..39667df4 100644
--- a/python/src/py_nvbench.cpp
+++ b/python/src/py_nvbench.cpp
@@ -124,7 +124,8 @@ struct nvbench_run_error : std::runtime_error
   // that are defined for the base class
   using std::runtime_error::runtime_error;
 };
-py::handle benchmark_exc{};
+
+PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store<py::object> exc_storage;
 
 void run_interruptible(nvbench::option_parser &parser)
 {
@@ -223,18 +224,18 @@ class GlobalBenchmarkRegistry
     }
     catch (py::error_already_set &e)
     {
-      py::raise_from(e, benchmark_exc.ptr(), "Python error raised ");
+      py::raise_from(e, exc_storage.get_stored().ptr(), "Python error raised ");
       throw py::error_already_set();
     }
     catch (const std::exception &e)
     {
       const std::string &exc_message = e.what();
-      py::set_error(benchmark_exc, exc_message.c_str());
+      py::set_error(exc_storage.get_stored(), exc_message.c_str());
       throw py::error_already_set();
     }
     catch (...)
     {
-      py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main");
+      py::set_error(exc_storage.get_stored(), "Caught unknown exception in nvbench_main");
       throw py::error_already_set();
     }
   }
@@ -1158,11 +1159,12 @@ PYBIND11_MODULE(_nvbench, m)
   static constexpr const char *exception_nvbench_runtime_error_doc = R"XXXX(
 An exception raised if running benchmarks encounters an error
 )XXXX";
-  py::object benchmark_exc_ =
-    py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError);
-  benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc;
-
-  benchmark_exc = benchmark_exc_.release();
+  exc_storage.call_once_and_store_result([&]() {  // callable is run at most once
+    py::object benchmark_exc_ =
+      py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError);
+    benchmark_exc_.attr("__doc__") = exception_nvbench_runtime_error_doc;
+    return benchmark_exc_;  // result is what exc_storage.get_stored() hands back later
+  });  // NOTE(review): a repeated module init would skip registering on `m` - confirm OK
 
   // ATTN: nvbench::benchmark_manager is a singleton, it is exposed through
   // GlobalBenchmarkRegistry class
@@ -1171,7 +1173,7 @@ An exception raised if running benchmarks encounters an error
                                                            py::nodelete{});
 
   // function register
-  auto func_register_impl = [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); };
+  auto func_register_impl = [](py::object fn) { return std::ref(global_registry->add_bench(fn)); };
   static constexpr const char *func_register_doc = R"XXXX(
 Register benchmark function of type Callable[[nvbench.State], None]
 )XXXX";
@@ -1206,7 +1208,7 @@ Register benchmark function of type Callable[[nvbench.State], None]
   // Testing utilities
   m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
   m.def("test_py_exception", []() {
-    py::set_error(benchmark_exc, "Test");
+    py::set_error(exc_storage.get_stored(), "Test");
     throw py::error_already_set();
   });
 }