diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index fbea314406..c2eb200de5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import threading import weakref from collections import namedtuple from typing import Union @@ -17,6 +18,14 @@ ) from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +# Lazy initialization state and synchronization +# For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. +# For regular Python builds with GIL, the lock overhead is minimal and the code remains safe. +_init_lock = threading.Lock() +_inited = False +_py_major_ver = None +_driver_ver = None +_kernel_ctypes = None _backend = { "old": { "file": driver.cuModuleLoad, @@ -27,35 +36,75 @@ } -# TODO: revisit this treatment for py313t builds -_inited = False -_py_major_ver = None -_driver_ver = None -_kernel_ctypes = None +def _lazy_init(): + """ + Initialize module-level state in a thread-safe manner. + This function is thread-safe and suitable for both: + - Regular Python builds (with GIL) + - Python 3.13t free-threaded builds (without GIL) -def _lazy_init(): + Uses double-checked locking pattern for performance: + - Fast path: check without lock if already initialized + - Slow path: acquire lock and initialize if needed + """ global _inited + # Fast path: already initialized (no lock needed for read) if _inited: return - global _py_major_ver, _driver_ver, _kernel_ctypes - # binding availability depends on cuda-python version - _py_major_ver, _ = get_binding_version() - if _py_major_ver >= 12: - _backend["new"] = { - "file": driver.cuLibraryLoadFromFile, - "data": driver.cuLibraryLoadData, - "kernel": driver.cuLibraryGetKernel, - "attribute": driver.cuKernelGetAttribute, - } - _kernel_ctypes = (driver.CUfunction, driver.CUkernel) - else: - _kernel_ctypes = (driver.CUfunction,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _py_major_ver >= 12 and _driver_ver >= 12040: - _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo - _inited = True + # Slow path: acquire lock and initialize + with _init_lock: + # Double-check: another thread might have initialized while we waited + if _inited: + return + + global _py_major_ver, _driver_ver, _kernel_ctypes + # binding availability depends on cuda-python version + _py_major_ver, _ = get_binding_version() + if _py_major_ver >= 12: + _backend["new"] = { + "file": driver.cuLibraryLoadFromFile, + "data": driver.cuLibraryLoadData, + "kernel": driver.cuLibraryGetKernel, + "attribute": driver.cuKernelGetAttribute, + } + _kernel_ctypes = (driver.CUfunction, driver.CUkernel) + else: + _kernel_ctypes = (driver.CUfunction,) + _driver_ver = handle_return(driver.cuDriverGetVersion()) + if _py_major_ver >= 12 and _driver_ver >= 12040: + _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo + + # Mark as initialized (must be last to ensure all state is set) + _inited = True + + +# Auto-initializing property accessors +def _get_py_major_ver(): + """Get the Python binding major version, initializing if needed.""" + _lazy_init() + return _py_major_ver + + +def _get_driver_ver(): + """Get the CUDA driver version, initializing if needed.""" + _lazy_init() + return _driver_ver + + +def _get_kernel_ctypes(): + """Get the kernel ctypes tuple, initializing if needed.""" + _lazy_init() + return _kernel_ctypes + + +def _get_backend_version(): + """Get the backend version ("new" or "old") based on CUDA version. + + Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API). + """ + return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" class KernelAttributes: @@ -70,7 +119,7 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" + self._backend_version = _get_backend_version() self._loader = _backend[self._backend_version] return self @@ -197,7 +246,9 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - class KernelOccupancy: - """ """ + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ def __new__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") @@ -378,7 +429,7 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert_type(obj, _kernel_ctypes) + assert_type(obj, _get_kernel_ctypes()) assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj @@ -399,9 +450,10 @@ def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: if attr_impl._backend_version != "new": raise NotImplementedError("New backend is required") if "paraminfo" not in attr_impl._loader: + driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " - f"Using driver version {_driver_ver // 1000}.{(_driver_ver % 1000) // 10}" + f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) arg_pos = 0 param_info_data = [] @@ -436,7 +488,43 @@ def occupancy(self) -> KernelOccupancy: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy - # TODO: implement from_handle() + @staticmethod + def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": + """Creates a new :obj:`Kernel` object from a foreign kernel handle. + + Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUfunction or CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. If not provided, + a placeholder ObjectCode will be created. Note that without a proper + ObjectCode, certain operations may be limited. + """ + + # Validate that handle is an integer + if not isinstance(handle, int): + raise TypeError(f"handle must be an integer, got {type(handle).__name__}") + + # Convert the integer handle to the appropriate driver type + if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: + # Try CUkernel first for newer CUDA versions + kernel_obj = driver.CUkernel(handle) + else: + # Use CUfunction for older versions + kernel_obj = driver.CUfunction(handle) + + # If no module provided, create a placeholder + if mod is None: + # Create a placeholder ObjectCode that won't try to load anything + mod = ObjectCode._init(b"", "cubin") + # Set a dummy handle to prevent lazy loading + mod._handle = 1 # Non-null placeholder + + return Kernel._from_obj(kernel_obj, mod) CodeTypeT = Union[bytes, bytearray, str] @@ -474,12 +562,11 @@ def __new__(self, *args, **kwargs): def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): self = super().__new__(cls) assert code_type in self._supported_code_type, f"{code_type=} is not supported" - _lazy_init() # handle is assigned during _lazy_load self._handle = None - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" + self._backend_version = _get_backend_version() self._loader = _backend[self._backend_version] self._code_type = code_type @@ -605,6 +692,42 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d """ return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) + @staticmethod + def from_handle( + handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None + ) -> "ObjectCode": + """Create a new :obj:`ObjectCode` object from a foreign module handle. + + Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object. + + Parameters + ---------- + handle : int + Module handle representing the address of a foreign + module object (CUmodule or CUlibrary). + code_type : str, optional + The type of code object this handle represents. Must be one of + "cubin", "ptx", "ltoir", "fatbin", "object", or "library". + (Default: "cubin") + name : str, optional + A human-readable identifier representing this code object. + symbol_mapping : dict, optional + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + # Create an ObjectCode instance with a placeholder module + # The handle will be set directly, bypassing the lazy loading + obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping) + + # Set the handle directly from the foreign handle + if obj._backend_version == "new": + obj._handle = driver.CUlibrary(handle) + else: + obj._handle = driver.CUmodule(handle) + + return obj + # TODO: do we want to unload in a finalizer? Probably not.. def _lazy_load_module(self, *args, **kwargs): diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index 121dd13963..d936ca0a26 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -860,3 +860,54 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. """ return self._mnff.handle + + @staticmethod + def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Program: + """Create a new :obj:`Program` object from a foreign program handle. + + Uses a nvrtcProgram or NVVM program handle represented as a Python int + to create a new :obj:`Program` object. + + Parameters + ---------- + handle : int + Program handle representing the address of a foreign + program object (nvrtcProgram for NVRTC backend, or + NVVM program handle for NVVM backend). + backend : str + The backend type of the program handle. Must be either + "NVRTC" or "NVVM". This determines how the handle is + interpreted and used. + options : :obj:`ProgramOptions`, optional + Program options that may be used for subsequent operations. + If not provided, default options will be created. + """ + backend = backend.upper() + if backend not in ("NVRTC", "NVVM"): + raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'") + + # Create a new Program instance without going through __init__ + prog = object.__new__(Program) + + # Initialize the members needed for finalization + # Note: We pass None as the program_obj to avoid finalization since + # we don't own the handle + prog._mnff = Program._MembersNeededForFinalize.__new__(Program._MembersNeededForFinalize) + prog._mnff.handle = None # Don't manage the foreign handle + prog._mnff.backend = backend + + # Store the backend and options + prog._backend = backend + prog._options = check_or_create_options(ProgramOptions, options, "Program options") + prog._linker = None + + # Store the handle directly without taking ownership + # This means the finalizer won't destroy it + if backend == "NVRTC": + prog._mnff.handle = nvrtc.nvrtcProgram(handle) + elif backend == "NVVM": + # For NVVM, we just store the handle as-is + # The actual NVVM program handle is opaque + prog._mnff.handle = handle + + return prog diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 4b3817ece4..fc07fe7741 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -420,3 +420,238 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode.code == result.code assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type + + +def test_object_code_from_handle(get_saxpy_kernel_cubin): + """Test ObjectCode.from_handle() with a valid handle""" + kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the handle from the original object code + handle = int(original_objcode.handle) + + # Create a new ObjectCode from the handle + objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map) + assert isinstance(objcode_from_handle, ObjectCode) + assert objcode_from_handle.code_type == "cubin" + + # Try to get a kernel from the new object code + kernel_from_handle = objcode_from_handle.get_kernel("saxpy") + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + + +def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx): + """Test ObjectCode.from_handle() with PTX code type""" + ptx, original_objcode = get_saxpy_kernel_ptx + + if not Program._can_load_generated_ptx(): + pytest.skip("PTX version too new for current driver") + + # Force loading to get a handle + _ = original_objcode.get_kernel("saxpy") + handle = int(original_objcode.handle) + + # Create a new ObjectCode from the handle with PTX code type + objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map) + assert isinstance(objcode_from_handle, ObjectCode) + assert objcode_from_handle.code_type == "ptx" + + +def test_kernel_from_handle(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with a valid handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle + kernel_from_handle = cuda.core._module.Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + + # Verify we can access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() without providing a module""" + original_kernel, _ = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle without a module + kernel_from_handle = cuda.core._module.Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + + # Verify we can still access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +# Edge case tests for from_handle methods + + +@pytest.mark.parametrize( + "invalid_code_type,expected_error", + [ + pytest.param("invalid_type", AssertionError, id="invalid_str"), + pytest.param("", AssertionError, id="empty_str"), + pytest.param(None, (AssertionError, TypeError), id="None"), + pytest.param(123, (AssertionError, TypeError), id="int"), + pytest.param(3.14, (AssertionError, TypeError), id="float"), + pytest.param(["cubin"], (AssertionError, TypeError), id="list"), + pytest.param(("cubin",), (AssertionError, TypeError), id="tuple"), + pytest.param({"type": "cubin"}, (AssertionError, TypeError), id="dict"), + pytest.param(b"cubin", (AssertionError, TypeError), id="bytes"), + pytest.param({"cubin"}, (AssertionError, TypeError), id="set"), + pytest.param(object(), (AssertionError, TypeError), id="object"), + ], +) +def test_object_code_from_handle_invalid_code_type(invalid_code_type, expected_error): + """Test ObjectCode.from_handle() with invalid code_type""" + with pytest.raises(expected_error): + ObjectCode.from_handle(0, invalid_code_type, symbol_mapping={}) + + +def test_object_code_from_handle_symbol_mapping_variations(): + """Test ObjectCode.from_handle() with various symbol_mapping values""" + # None symbol_mapping (should default to empty dict) + objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None) + assert objcode1._sym_map == {} + + # Empty dict + objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={}) + assert objcode2._sym_map == {} + + # Valid symbol mapping + sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"} + objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map) + assert objcode3._sym_map == sym_map + + +def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kernel_cubin): + """Test that symbol_mapping is actually used when getting kernels""" + _, original_objcode = get_saxpy_kernel_cubin + original_handle = int(original_objcode.handle) + + # Create ObjectCode with correct symbol mapping + objcode_with_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + + # Should successfully get kernel using unmangled name from symbol_mapping + kernel = objcode_with_map.get_kernel("saxpy") + assert isinstance(kernel, cuda.core._module.Kernel) + + # Create ObjectCode without symbol mapping + objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={}) + + # Should fail to get kernel using unmangled name (no mapping available) + with pytest.raises(CUDAError): + objcode_no_map.get_kernel("saxpy") + + +def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin): + """Test handle lifecycle and ownership with from_handle""" + original_kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the original handle + original_handle = int(original_objcode.handle) + + # Create a new ObjectCode from the same handle + objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + + # Both should reference the same underlying CUDA module + assert int(objcode_from_handle.handle) == original_handle + + # Get a kernel from the from_handle version + kernel_from_copy = objcode_from_handle.get_kernel("saxpy") + assert isinstance(kernel_from_copy, cuda.core._module.Kernel) + + # The original should still work + kernel_from_original = original_objcode.get_kernel("saxpy") + assert isinstance(kernel_from_original, cuda.core._module.Kernel) + + # Both kernels should reference the same underlying CUDA kernel handle + # If handles are equal, they're the same kernel - no need to check attributes + assert int(kernel_from_copy._handle) == int(kernel_from_original._handle) + + +def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple ObjectCode instances from the same handle""" + original_kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the original handle + original_handle = int(original_objcode.handle) + + # Create multiple ObjectCode instances from the same handle + objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + + # All should have the same handle + assert int(objcode1.handle) == original_handle + assert int(objcode2.handle) == original_handle + assert int(objcode3.handle) == original_handle + + # All should be able to get kernels + kernel1 = objcode1.get_kernel("saxpy") + kernel2 = objcode2.get_kernel("saxpy") + kernel3 = objcode3.get_kernel("saxpy") + + assert isinstance(kernel1, cuda.core._module.Kernel) + assert isinstance(kernel2, cuda.core._module.Kernel) + assert isinstance(kernel3, cuda.core._module.Kernel) + + +@pytest.mark.parametrize( + "invalid_value", + [ + pytest.param("not_an_int", id="str"), + pytest.param(2.71828, id="float"), + pytest.param(None, id="None"), + pytest.param({"handle": 123}, id="dict"), + pytest.param([456], id="list"), + pytest.param((789,), id="tuple"), + pytest.param(3 + 4j, id="complex"), + pytest.param(b"\xde\xad\xbe\xef", id="bytes"), + pytest.param({999}, id="set"), + pytest.param(object(), id="object"), + ], +) +def test_kernel_from_handle_type_validation(invalid_value): + """Test Kernel.from_handle() with wrong handle types""" + with pytest.raises(TypeError): + cuda.core._module.Kernel.from_handle(invalid_value) + + +def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with invalid module parameter""" + original_kernel, _ = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Invalid module type (should fail type assertion in _from_obj) + with pytest.raises((TypeError, AssertionError)): + cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode") + + with pytest.raises((TypeError, AssertionError)): + cuda.core._module.Kernel.from_handle(handle, mod=12345) + + +def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple Kernel instances from the same handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Create multiple Kernel instances from the same handle + kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode) + kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode) + kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode) + + # All should be valid Kernel objects + assert isinstance(kernel1, cuda.core._module.Kernel) + assert isinstance(kernel2, cuda.core._module.Kernel) + assert isinstance(kernel3, cuda.core._module.Kernel) + + # All should reference the same underlying CUDA kernel handle + assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9a9e4926ae..105595839a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -547,3 +547,46 @@ def test_program_options_as_bytes_nvvm_unsupported_option(): options = ProgramOptions(arch="sm_80", lineinfo=True) with pytest.raises(CUDAError, match="not supported by NVVM backend"): options.as_bytes("nvvm") + + +def test_program_from_handle_nvrtc(init_cuda): + """Test Program.from_handle() with NVRTC backend""" + # Create a regular program to get a handle + code = 'extern "C" __global__ void test_kernel() {}' + original_program = Program(code, "c++") + assert original_program.backend == "NVRTC" + + # Get the handle + handle = int(original_program.handle) + + # Create a new program from the handle + program_from_handle = Program.from_handle(handle, "NVRTC") + assert program_from_handle.backend == "NVRTC" + # Note: We don't own the handle, so we shouldn't close it in the from_handle instance + + # Clean up the original program + original_program.close() + + +@nvvm_available +def test_program_from_handle_nvvm(init_cuda, nvvm_ir): + """Test Program.from_handle() with NVVM backend""" + # Create a regular NVVM program to get a handle + original_program = Program(nvvm_ir, "nvvm") + assert original_program.backend == "NVVM" + + # Get the handle + handle = int(original_program.handle) + + # Create a new program from the handle + program_from_handle = Program.from_handle(handle, "NVVM") + assert program_from_handle.backend == "NVVM" + + # Clean up the original program + original_program.close() + + +def test_program_from_handle_invalid_backend(): + """Test Program.from_handle() with invalid backend""" + with pytest.raises(ValueError, match="Unsupported backend 'INVALID'"): + Program.from_handle(0, "INVALID")