From 0dc9af988404a2f5b41bdd8cba3255d57091cc8d Mon Sep 17 00:00:00 2001
From: Philip Mueller
Date: Thu, 1 May 2025 12:12:41 +0200
Subject: [PATCH 1/4] Squashed commit of the following:

commit 635da6c4e64fc983397eb2f90af4f102eb92b286
Author: Ioannis Magkanaris
Date: Fri Apr 25 15:40:25 2025 +0200

    Add NVTX range in CUDA GPU kernel call of program

---
 dace/codegen/CMakeLists.txt       | 2 +-
 dace/codegen/targets/framecode.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt
index 5482d4d30d..7d1ca4d714 100644
--- a/dace/codegen/CMakeLists.txt
+++ b/dace/codegen/CMakeLists.txt
@@ -141,7 +141,7 @@ if(DACE_ENABLE_CUDA)
   set(CMAKE_CUDA_ARCHITECTURES "${LOCAL_CUDA_ARCHITECTURES}")
   enable_language(CUDA)

-  list(APPEND DACE_LIBS CUDA::cudart)
+  list(APPEND DACE_LIBS CUDA::cudart CUDA::nvtx3)
   add_definitions(-DWITH_CUDA)

   if (MSVC_IDE)
diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index 33bc562f73..7a62170c3d 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -140,6 +140,8 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend:
         if backend == 'frame':
             global_stream.write('#include "../../include/hash.h"\n', sdfg)

+        global_stream.write('#ifdef WITH_CUDA\n#include <nvtx3/nvToolsExt.h>\n#endif\n', sdfg)
+
         #########################################################
         # Environment-based includes
         for env in self.environments:
@@ -266,7 +268,13 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre
             f'''
DACE_EXPORTED void __program_{fname}({mangle_dace_state_struct_name(fname)} *__state{params_comma})
{{
+    #ifdef WITH_CUDA
+    nvtxRangePushA("{fname}");
+    #endif
    __program_{fname}_internal(__state{paramnames_comma});
+    #ifdef WITH_CUDA
+    nvtxRangePop();
+    #endif
}}''', sdfg)

         for target in self._dispatcher.used_targets:

From d495495ce44773358366c82f48aa39fe23391bd9 Mon Sep 17 00:00:00 2001
From: Philip Mueller
Date: Thu, 1 May 2025 12:13:11 +0200
Subject: [PATCH 2/4] Squashed commit of the following:

commit aef99452609f49f3d0d00f911a4b89f9a25f191f
Author: Philip Mueller
Date: Tue Mar 25 06:52:35 2025 +0100

    As an experiment, removed some code I think is useless; let's see what the tests say.

commit e5bf87f0ec0252857702d8ffded92dd87d2040b1
Author: Philip Mueller
Date: Mon Mar 24 07:05:18 2025 +0100

    Added a comment to address the possible issues with views.

commit ba978742791c274e4f7bd79c506e7c6b392034e6
Merge: b0b994551 4245396e4
Author: Philip Mueller
Date: Fri Mar 21 16:07:12 2025 +0100

    Merge remote-tracking branch 'spcl/main' into improved-2d-copy

commit b0b994551bab28b1a7ca32d072a6d983aac71277
Author: Philip Mueller
Date: Fri Mar 21 16:06:09 2025 +0100

    Added Alexnicks's suggestions.

commit 065e0d7456f74e1f8b9c420fb20325dc4f1fa490
Author: Philip Mueller
Date: Mon Mar 17 08:56:39 2025 +0100

    Added tests to ensure that the new verification works as expected.

commit 51182e58da7773dafd62694935df281ff1ee083a
Author: Philip Mueller
Date: Mon Mar 17 08:14:14 2025 +0100

    Moved the test for negative sized subsets from the Memlet to the `validate_state()` function.
    The reason is that in some cases such a subset is valid, for example if an edge connects an
    AccessNode and a MapEntry, because in that case the map might not be executed. Since the
    Memlet does not have access to its source and destination nodes, it cannot check that, so
    the test was moved to a location that can do this check.
    However, it only does the check for AN to AN connections, which is a bit restrictive,
    but this is something for later.

commit 2801967a71cb71a902e30973fd19acf5c566e423
Author: Philip Mueller
Date: Mon Mar 17 07:41:51 2025 +0100

    I am not sure why the printout of the edge is not correct, but it is not where I thought I found it.

commit 3166302f30240db6332c645eb40903d1dea78938
Author: Philip Mueller
Date: Sat Mar 15 08:23:21 2025 +0100

    Fixed some issues and made it more logical.

commit 02d87b55476569f839420a1c7b62dc3e364b9173
Merge: 801adb105 d13079215
Author: Philip Mueller
Date: Sat Mar 15 08:18:08 2025 +0100

    Merge remote-tracking branch 'spcl/main' into improved-2d-copy

commit 801adb105b0f758b727b50ddf425ec3158a8f4da
Author: Philip Mueller
Date: Sat Mar 15 08:15:45 2025 +0100

    Added more verification.

commit 66b43f869b929ac969c75206fb2ce58a9d706059
Author: Philip Mueller
Date: Sat Mar 15 07:57:03 2025 +0100

    Simplified some checks.

commit 76a1a58e5c61bfc09f97fbb56d6aff0d20d8f94f
Author: Philip Mueller
Date: Fri Mar 14 15:20:49 2025 +0100

    Added a new test for the pseudo 1D case, i.e. when we reduce a 2D copy to a 1D copy
    because it happens to be contiguously allocated.

commit 0b15a7407f1789e4fd9500650187e858ba394fe7
Author: Philip Mueller
Date: Fri Mar 14 14:59:03 2025 +0100

    Added a note about wrong usage of eid in validation.

commit 322ecda119300f7402e53b81e7a340dbd3ba5aed
Author: Philip Mueller
Date: Fri Mar 14 14:52:28 2025 +0100

    Improved memlet checking.

commit 61ea7a6874d7c1b884b918413bbed84ca6b5259d
Author: Philip Mueller
Date: Fri Mar 14 14:43:27 2025 +0100

    Added a new test to the SDFG.

commit a67ad2a6d415d8f3431b22bb918d8ee3fba78f21
Author: Philip Mueller
Date: Fri Mar 14 14:36:02 2025 +0100

    Added also a test for the strided 1D copy.

commit c931b9193c86339048c65cbbe69887b9b33aba9e
Author: Philip Mueller
Date: Fri Mar 14 14:07:16 2025 +0100

    Now 2D copies work; more tests needed.

commit d0a396f929276be5a5994c8afb4dd1b465e11983
Author: Philip Mueller
Date: Fri Mar 14 13:42:18 2025 +0100

    Updated the memlet copying; I think I now have all the cases. I will now write the tests.

commit 9b49c9e36c45a5b295c9042031e379724342ebda
Author: Philip Mueller
Date: Fri Mar 14 11:16:12 2025 +0100

    Made a first version of the new copy implementation. But I have to run the unit tests.

commit feea97f46221773479bc8b8e73801e8fbe49abbe
Author: Philip Mueller
Date: Fri Mar 14 10:18:08 2025 +0100

    Started with the implementation of a better copy, but I have to fix it more.

---
 dace/codegen/targets/cuda.py              | 148 ++++++----
 dace/data.py                              |  16 +-
 dace/memlet.py                            |   3 +
 dace/sdfg/validation.py                   |  24 +-
 tests/codegen/cuda_memcopy_test.py        | 323 ++++++++++++++++++++++
 tests/sdfg/validation/subset_size_test.py |  83 ++++++
 6 files changed, 539 insertions(+), 58 deletions(-)
 create mode 100644 tests/sdfg/validation/subset_size_test.py

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index aaba068da3..7ea1b582a6 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -161,35 +161,40 @@ def preprocess(self, sdfg: SDFG) -> None:
                     nsdfg = state.parent
                     if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
                             and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):
+
+                        # NOTE: If possible `memlet_copy_to_absolute_strides()` will collapse an
+                        #   ND copy into a 1D copy if the memory is contiguous. In that case
+                        #   `copy_shape` will only have one element.
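+                        #   (Illustration, not from the original patch: copying the full range
+                        #   between two C-order (20, 30) arrays would give copy_shape == [600]
+                        #   with unit strides, while a strided subset keeps both entries of
+                        #   copy_shape.)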
                         copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
                             None, nsdfg, state, e, e.src, e.dst)
                         dims = len(copy_shape)

                         # Skip supported copy types
                         if dims == 1:
+                            # NOTE: We do not check if the stride is `1`. See `_emit_copy()` for more details.
                             continue
                         elif dims == 2:
-                            if src_strides[-1] != 1 or dst_strides[-1] != 1:
-                                # NOTE: Special case of continuous copy
-                                # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                                # with copy shape [I, J] and strides [J*K, K], [J, 1]
-                                try:
-                                    is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                                    is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
-                                except (TypeError, ValueError):
-                                    is_src_cont = False
-                                    is_dst_cont = False
-                                if is_src_cont and is_dst_cont:
-                                    continue
-                                else:
+                            # Because `memlet_copy_to_absolute_strides()` handles contiguous copies
+                            # transparently, we only have to check if we have FORTRAN or C order.
+                            # If we have neither, we have to turn this copy into a Map.
+                            is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+                            is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1
+                            if is_c_order or is_fortran_order:
                                 continue
                         elif dims > 2:
-                            if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+                            # Any higher-dimensional copy must be in C order. If it is not,
+                            # turn it into a copy Map.
+                            if src_strides[-1] == 1 and dst_strides[-1] == 1:
                                 continue

                         # Turn unsupported copy to a map
                         try:
-                            CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
+                            CopyToMap.apply_to(nsdfg,
+                                               save=False,
+                                               annotate=False,
+                                               a=e.src,
+                                               b=e.dst,
+                                               options={"ignore_strides": True})
                         except ValueError:  # If transformation doesn't match, continue normally
                             continue
@@ -973,32 +978,21 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
             copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                 self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types))
             dims = len(copy_shape)
-            dtype = dst_node.desc(sdfg).dtype

-            # Handle unsupported copy types
-            if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1):
-                # NOTE: Special case of continuous copy
-                # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                # with copy shape [I, J] and strides [J*K, K], [J, 1]
-                try:
-                    is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                    is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
-                except (TypeError, ValueError):
-                    is_src_cont = False
-                    is_dst_cont = False
-                if is_src_cont and is_dst_cont:
-                    dims = 1
-                    copy_shape = [copy_shape[0] * copy_shape[1]]
-                    src_strides = [src_strides[1]]
-                    dst_strides = [dst_strides[1]]
-                else:
-                    raise NotImplementedError('2D copy only supported with one stride')
+            # In 1D there is no difference between FORTRAN and C order, thus we set them
+            # to the same value. The value indicates whether the stride is `1`.
+            # TODO: Figure out if this is enough for views.
+            is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+            is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1

-            # Currently we only support ND copies when they can be represented
-            # as a 1D copy or as a 2D strided copy
             if dims > 2:
-                if src_strides[-1] != 1 or dst_strides[-1] != 1:
+                # Currently we only support ND copies when they can be represented
+                # as a 1D copy or as a 2D strided copy.
+                # NOTE: Not sure if this test is enough; it should also be checked that
+                #   the strides are ordered, i.e. largest stride on the left.
+                if not is_c_order:
+                    # TODO: Implement the FORTRAN case.
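+                    # (A possible approach, not part of this patch: reversing `copy_shape`,
+                    #  `src_strides` and `dst_strides` would reduce a FORTRAN-order ND copy
+                    #  to the C-order case checked above.)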
                    raise NotImplementedError(
                        'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n'
                        f'  Nodes: src {src_node} ({src_storage}), dst {dst_node} ({dst_storage})\n'
@@ -1026,7 +1020,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                for d in range(dims - 2):
                    callsite_stream.write("}")

-            if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+            elif dims == 1 and is_c_order:
+                # A 1D copy whose stride is known to be 1 at code generation time.
                copysize = ' * '.join(_topy(copy_shape))
                array_length = copysize
                copysize += ' * sizeof(%s)' % dtype.ctype
@@ -1064,22 +1059,70 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                                     backend=self.backend), cfg, state_id, [src_node, dst_node])
                    callsite_stream.write('}')
-            elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)):
+
+            elif dims == 1 and not is_c_order:
+                # This is the case generated for expressions such as `A[::3]`; we reduce it
+                # to a 2D copy.
+                callsite_stream.write(
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+
+            elif dims == 2 and is_c_order:
+                # Copying a 2D array that is in C order, i.e. the last stride is 1.
                callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
-            elif dims == 2:
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            elif dims == 2 and is_fortran_order:
+                # Copying a 2D array into a 2D array that is in FORTRAN order, i.e. the first
+                # stride is one. The CUDA API cannot handle such cases directly; however, by
+                # "transposing" it is possible to use `Memcpy2DAsync`.
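+                # (Illustration, not from the original patch: for FORTRAN-order arrays with
+                #  strides (1, 20), the call below swaps the roles of the two dimensions,
+                #  i.e. the pitch comes from strides[1], the width in bytes from copy_shape[0],
+                #  and the height in rows from copy_shape[1].)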
                callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[1])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[1]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            else:
+                raise NotImplementedError("The requested copy operation is not implemented.")

            # Post-copy synchronization
            if is_sync:
@@ -1126,7 +1169,6 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
            # Obtain copy information
            copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types))
-            dims = len(copy_shape)

            funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims)
diff --git a/dace/data.py b/dace/data.py
index 74c1e8b985..3279aff63b 100644
--- a/dace/data.py
+++ b/dace/data.py
@@ -210,6 +210,8 @@ def _validate(self):
        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape):
            raise TypeError('Shape must be a list or tuple of integer values '
                            'or symbols')
+        if any((shp < 0) == True for shp in self.shape):
+            raise TypeError(f'Found negative shape in Data, its shape was {self.shape}')
        return True

    def to_json(self):
@@ -1471,12 +1473,20 @@ def validate(self):
        super(Array, self).validate()
        if len(self.strides) != len(self.shape):
            raise TypeError('Strides must be the same size as shape')
+        if len(self.offset) != len(self.shape):
+            raise TypeError('Offset must be the same size as shape')

        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
               for s in self.strides):
            raise TypeError('Strides must be a list or tuple of integer values or symbols')
-
-        if len(self.offset) != len(self.shape):
-            raise TypeError('Offset must be the same size as shape')
+        if any(not isinstance(off, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
+               for off in self.offset):
+            raise TypeError('Offset must be a list or tuple of integer values or symbols')
+
+        # Actually it would be enough to enforce non-negativity only if the shape is larger than one.
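+        # (`(stride < 0) == True` is used instead of `stride < 0` because for symbolic
+        #  values the comparison may stay unevaluated; only a definite `True` raises.)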
+        if any((stride < 0) == True for stride in self.strides):
+            raise TypeError(f'Found negative strides in array, they were {self.strides}')
+        if (self.total_size < 0) == True:
+            raise TypeError(f'The total size of an array must not be negative, but it was {self.total_size}')

    def covers_range(self, rng):
        if len(rng) != len(self.shape):
diff --git a/dace/memlet.py b/dace/memlet.py
index 46dac51edf..090a7890fa 100644
--- a/dace/memlet.py
+++ b/dace/memlet.py
@@ -534,6 +534,9 @@ def dst_subset(self, new_dst_subset):
    def validate(self, sdfg, state):
        if self.data is not None and self.data not in sdfg.arrays:
            raise KeyError('Array "%s" not found in SDFG' % self.data)
+        # NOTE: We do not check here if the subsets have a negative size, because such a
+        #   subset is valid in certain cases, for example if an AccessNode is connected to a
+        #   MapEntry, because the Map is then not executed. Thus we do the check in the
+        #   `validate_state()` function.

    def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]:
        """
diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py
index ccfb0adada..f501697b57 100644
--- a/dace/sdfg/validation.py
+++ b/dace/sdfg/validation.py
@@ -9,7 +9,7 @@
 import networkx as nx

-from dace import dtypes, subsets, symbolic
+from dace import dtypes, subsets, symbolic, data
 from dace.dtypes import DebugInfo

 if TYPE_CHECKING:
@@ -656,7 +656,6 @@ def validate_state(state: 'dace.sdfg.SDFGState',
            )

    ########################################
-    # Memlet checks
    for eid, e in enumerate(state.edges()):
        # Reference check
        if id(e) in references:
@@ -680,6 +679,27 @@ def validate_state(state: 'dace.sdfg.SDFGState',
        except Exception as ex:
            raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid)

+        # If the edge is a connection between two AccessNodes, check if the subset has a negative size.
+        # NOTE: We _should_ do this check in `Memlet.validate()`; however, this is not possible,
+        #   because a connection between an AccessNode and a MapEntry with a negative size is
+        #   legal, since the Map will not run in that case. This constellation cannot be tested
+        #   for in the Memlet's validation function, so we have to do it here.
+        # NOTE: Zero size is explicitly allowed because it is essentially `memcpy(dst, src, 0)`,
+        #   which is safe.
+        # TODO: The AN to AN connection is the most obvious case, but the check should be extended.
+        if isinstance(e.src, nd.AccessNode) and isinstance(e.dst, nd.AccessNode):
+            e_memlet: dace.Memlet = e.data
+            if e_memlet.subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.subset.size()}',
+                        sdfg, state_id, eid)
+            if e_memlet.other_subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.other_subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`other_subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.other_subset.size()}',
+                        sdfg, state_id, eid)
+
        # For every memlet, obtain its full path in the DFG
        path = state.memlet_path(e)
        src_node = path[0].src
diff --git a/tests/codegen/cuda_memcopy_test.py b/tests/codegen/cuda_memcopy_test.py
index 36c5d19f7a..34853f0adb 100644
--- a/tests/codegen/cuda_memcopy_test.py
+++ b/tests/codegen/cuda_memcopy_test.py
@@ -1,8 +1,10 @@
 """ Tests code generation for array copy on GPU target.
""" import dace from dace.transformation.auto import auto_optimize +from dace.sdfg import nodes as dace_nodes import pytest +import copy import re # this test requires cupy module @@ -12,6 +14,237 @@ rng = cp.random.default_rng(42) +def count_node(sdfg: dace.SDFG, node_type): + nb_nodes = 0 + for rsdfg in sdfg.all_sdfgs_recursive(): + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, node_type): + nb_nodes += 1 + return nb_nodes + + +def _make_2d_gpu_copy_sdfg(c_order: bool, ) -> dace.SDFG: + """The SDFG performs a copy from the input of the output, that is continuous. + + Essentially the function will generate am SDFG that performs the following + operation: + ```python + B[2:7, 3:9] = A[1:6, 2:8] + ``` + However, two arrays have a shape of `(20, 30)`. This means that this copy + can not be expressed as a continuous copy. Regardless which memory order + that is used, which can be selected by `c_order`. + """ + sdfg = dace.SDFG(f'gpu_2d_copy_{"corder" if c_order else "forder"}_copy_sdfg') + state = sdfg.add_state(is_start_block=True) + + for aname in 'AB': + sdfg.add_array( + name=aname, + shape=(20, 30), + dtype=dace.float64, + storage=dace.StorageType.GPU_Global, + transient=False, + strides=((30, 1) if c_order else (1, 20)), + ) + + state.add_nedge( + state.add_access("A"), + state.add_access("B"), + dace.Memlet("A[1:6, 2:8] -> [2:7, 3:9]"), + ) + sdfg.validate() + + return sdfg + + +def _perform_2d_gpu_copy_test(c_order: bool, ): + """Check 2D strided copies are handled by the `Memcpy2D` family. + """ + sdfg = _make_2d_gpu_copy_sdfg(c_order=c_order) + assert count_node(sdfg, dace_nodes.AccessNode) == 2 + assert count_node(sdfg, dace_nodes.MapEntry) == 0 + + # Now generate the code. + csdfg = sdfg.compile() + + # Ensure that the copy was not turned into a Map + assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2 + assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0 + + # Ensure that the correct call was issued. + # We have to look at the CPU code and not at the GPU. + code = sdfg.generate_code()[0].clean_code + m = re.search(r'(cuda|hip)Memcpy2DAsync\b', code) + assert m is not None + + # Generate input data. + ref = { + "A": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"), + "B": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"), + } + + # We can not use `deepcopy` or `.copy()` because this would set the strides to `C` order. 
+    res = {}
+    for name in ref.keys():
+        res[name] = cp.empty_like(ref[name])
+        res[name][:] = ref[name][:]
+
+    exp_strides = (240, 8) if c_order else (8, 160)
+    assert all(v.strides == exp_strides for v in ref.values())
+    assert all(v.strides == exp_strides for v in res.values())
+
+    # Now apply the operation on the reference.
+    ref["B"][2:7, 3:9] = ref["A"][1:6, 2:8]
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def _make_1d_gpu_copy(
+    src_row: bool,
+    dst_row: bool,
+) -> dace.SDFG:
+    sdfg = dace.SDFG(f'gpu_1d_copy_{"row" if src_row else "col"}_{"row" if dst_row else "col"}_copy_sdfg')
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, 20),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+        )
+
+    src_subset = "1, 1:9" if src_row else "1:9, 2"
+    dst_subset = "3, 0:8" if dst_row else "0:8, 4"
+
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet(f"A[{src_subset}] -> [{dst_subset}]"),
+    )
+    sdfg.validate()
+    return sdfg
+
+
+def _perform_1d_gpu_copy(
+    src_row: bool,
+    dst_row: bool,
+):
+    sdfg = _make_1d_gpu_copy(src_row=src_row, dst_row=dst_row)
+    assert count_node(sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(sdfg, dace_nodes.MapEntry) == 0
+
+    # Now generate the code.
+    csdfg = sdfg.compile()
+
+    # Ensure that the copy was not turned into a Map.
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    # It will always result in a call to `Memcpy2D`, except when both the source and
+    # the destination operate on rows; then it is a simple 1D copy.
+    code = sdfg.generate_code()[0].clean_code
+    if src_row and dst_row:
+        m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    else:
+        m = re.search(r'(cuda|hip)Memcpy2DAsync\b', code)
+    assert m is not None
+
+    # Generate input data.
+    ref = {
+        "A": cp.array(cp.random.rand(20, 20), dtype=cp.float64, order="C"),
+        "B": cp.array(cp.random.rand(20, 20), dtype=cp.float64, order="C"),
+    }
+    res = {k: v.copy() for k, v in ref.items()}
+
+    # Now perform the reference operation.
+    src_subset = ref["A"][1, 1:9] if src_row else ref["A"][1:9, 2]
+    if dst_row:
+        ref["B"][3, 0:8] = src_subset
+    else:
+        ref["B"][0:8, 4] = src_subset
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def _make_pseudo_1d_copy_sdfg(c_order: bool, ) -> dace.SDFG:
+    """An SDFG that performs a 2D copy that can be turned into a 1D copy.
+    """
+    sdfg = dace.SDFG(f'gpu_pseudo_1d_copy_{"corder" if c_order else "forder"}_sdfg')
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, 30),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+            strides=((30, 1) if c_order else (1, 20)),
+        )
+
+    cpy_subset = "1:18, 0:30" if c_order else "0:20, 2:29"
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet(f"A[{cpy_subset}] -> [{cpy_subset}]"),
+    )
+    sdfg.validate()
+
+    return sdfg
+
+
+def _perform_pseudo_1d_copy_test(c_order: bool):
+    sdfg = _make_pseudo_1d_copy_sdfg(c_order=c_order)
+    assert count_node(sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(sdfg, dace_nodes.MapEntry) == 0
+
+    # Now generate the code.
+    csdfg = sdfg.compile()
+
+    # Ensure that the copy was not turned into a Map.
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    code = sdfg.generate_code()[0].clean_code
+    m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    assert m is not None
+
+    # Generate input data.
+    ref = {
+        "A": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"),
+        "B": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"),
+    }
+
+    # We cannot use `deepcopy` or `.copy()` because this would set the strides to `C` order.
+    res = {}
+    for name in ref.keys():
+        res[name] = cp.empty_like(ref[name])
+        res[name][:] = ref[name][:]
+
+    # Perform the reference computation.
+    if c_order:
+        ref["B"][1:18, 0:30] = ref["A"][1:18, 0:30]
+    else:
+        ref["B"][0:20, 2:29] = ref["A"][0:20, 2:29]
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
 @pytest.mark.gpu
 def test_gpu_shared_to_global_1D():
     M = 32
@@ -88,6 +321,96 @@ def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N,
     assert m is not None


+@pytest.mark.gpu
+def test_gpu_1d_copy():
+    sdfg = dace.SDFG("gpu_1d_copy_sdfg")
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, ),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+        )
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet("A[2:13] -> [1:12]"),
+    )
+    sdfg.validate()
+
+    csdfg = sdfg.compile()
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    code = sdfg.generate_code()[0].clean_code
+    m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    assert m is not None
+
+    # Now run the SDFG.
+    ref = {
+        "A": cp.array(cp.random.rand(20), dtype=cp.float64),
+        "B": cp.array(cp.random.rand(20), dtype=cp.float64),
+    }
+    res = {k: v.copy() for k, v in ref.items()}
+
+    ref["B"][1:12] = ref["A"][2:13]
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+@pytest.mark.gpu
+def test_2d_c_order_gpu_copy():
+    _perform_2d_gpu_copy_test(c_order=True)
+
+
+@pytest.mark.gpu
+def test_2d_f_order_gpu_copy():
+    _perform_2d_gpu_copy_test(c_order=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_row_row():
+    _perform_1d_gpu_copy(src_row=True, dst_row=True)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_row_col():
+    _perform_1d_gpu_copy(src_row=True, dst_row=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_col_col():
+    _perform_1d_gpu_copy(src_row=False, dst_row=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_col_row():
+    _perform_1d_gpu_copy(src_row=False, dst_row=True)
+
+
+@pytest.mark.gpu
+def test_gpu_pseudo_1d_copy_c_order():
+    _perform_pseudo_1d_copy_test(c_order=True)
+
+
+@pytest.mark.gpu
+def test_gpu_pseudo_1d_copy_f_order():
+    _perform_pseudo_1d_copy_test(c_order=False)
+
+
 if __name__ == '__main__':
     test_gpu_shared_to_global_1D()
     test_gpu_shared_to_global_1D_accumulate()
+    test_2d_c_order_gpu_copy()
+    test_2d_f_order_gpu_copy()
+    test_gpu_1d_copy_row_row()
+    test_gpu_1d_copy_row_col()
+    test_gpu_1d_copy_col_row()
+    test_gpu_1d_copy_col_col()
+    test_gpu_1d_copy()
+    test_gpu_pseudo_1d_copy_c_order()
+    test_gpu_pseudo_1d_copy_f_order()
diff --git a/tests/sdfg/validation/subset_size_test.py b/tests/sdfg/validation/subset_size_test.py
new file mode 100644
index 0000000000..bc01b85a12
--- /dev/null
+++ b/tests/sdfg/validation/subset_size_test.py
@@ -0,0 +1,83 @@
+from typing import Tuple
+
+import dace
+
+import re
+import pytest
+import numpy as np
+
+
+def _make_sdfg_with_zero_sized_an_to_an_memlet() -> Tuple[dace.SDFG, dace.SDFGState]:
+    """Generates an SDFG that performs a copy that has a zero size.
+    """
+    sdfg = dace.SDFG("zero_size_copy_sdfg")
+    state = sdfg.add_state(is_start_block=True)
+
+    for name in "AB":
+        sdfg.add_array(
+            name=name,
+            shape=(20, 20),
+            dtype=dace.float64,
+            transient=True,
+        )
+
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet("A[2:17, 2:2] -> [2:18, 3:3]"),
+    )
+
+    return sdfg, state
+
+
+def test_an_to_an_memlet_with_zero_size():
+    sdfg, state = _make_sdfg_with_zero_sized_an_to_an_memlet()
+    assert sdfg.number_of_nodes() == 1
+    assert state.number_of_nodes() == 2
+
+    sdfg.validate()
+
+    # This zero sized copy should be considered valid.
+    assert sdfg.is_valid()
+
+    # The SDFG should be a no-op.
+    ref = {
+        "A": np.array(np.random.rand(20, 20), copy=True, order="C", dtype=np.float64),
+        "B": np.array(np.random.rand(20, 20), copy=True, order="C", dtype=np.float64),
+    }
+    res = {k: np.array(v, order="C", copy=True) for k, v in ref.items()}
+
+    csdfg = sdfg.compile()
+    assert csdfg.sdfg.number_of_nodes() == 1
+    assert csdfg.sdfg.states()[0].number_of_nodes() == 2
+    csdfg(**res)
+
+    assert all(np.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def test_an_to_an_memlet_with_negative_size():
+    """Tests that an AccessNode to AccessNode Memlet with a negative sized subset leads to an invalid SDFG.
+ """ + sdfg = dace.SDFG("an_to_an_memlet_with_negative_size") + state = sdfg.add_state(is_start_block=True) + + for name in "AB": + sdfg.add_array( + name=name, + shape=(20, 20), + dtype=dace.float64, + transient=True, + ) + + state.add_nedge( + state.add_access("A"), + state.add_access("B"), + dace.Memlet("A[2:17, 13:2] -> [2:18, 14:3]"), + ) + + with pytest.raises( + expected_exception=dace.sdfg.InvalidSDFGEdgeError, + match=re.escape( + f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was [15, -11]'), + ): + sdfg.validate() From 8b9b674abfcbd0148a32a87fa6d88e240030435d Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 1 May 2025 12:14:03 +0200 Subject: [PATCH 3/4] Squashed commit of the following: commit 5197dcf3b0604de750a018204e4a8c9559633362 Author: Edoardo Paone Date: Wed Apr 30 07:43:10 2025 +0200 fix in cpp codegen commit c01760ca6d13769e1ce7ea9defc1ef7098e89e1b Author: Edoardo Paone Date: Wed Apr 30 07:42:53 2025 +0200 Revert "add default - needed" This reverts commit 47d65ac573f210baf5ccaad5a17c85f9c7480ed6. commit 580f743a2aec8cad4aae24d89768717d16bb7bb1 Author: Edoardo Paone Date: Wed Apr 30 07:42:05 2025 +0200 Revert "edit" This reverts commit a7138d252115a0bf313d212e086d387214dadbc3. commit a7138d252115a0bf313d212e086d387214dadbc3 Author: Edoardo Paone Date: Wed Apr 30 07:26:14 2025 +0200 edit commit 47d65ac573f210baf5ccaad5a17c85f9c7480ed6 Author: Edoardo Paone Date: Tue Apr 29 23:27:55 2025 +0200 add default - needed commit a10b5b3d01c7072dddbff02e1569e75630efae17 Author: Edoardo Paone Date: Tue Apr 29 23:07:09 2025 +0200 minor edit commit 40395a8c46e0ebf42d0180a88f5471ba1924122c Author: Edoardo Paone Date: Tue Apr 29 22:42:22 2025 +0200 use ContextVar for _in_device_code --- dace/codegen/targets/cpp.py | 2 +- dace/codegen/targets/cuda.py | 38 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 5c4d04c0a7..38e16e72d6 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -257,7 +257,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' - elif not CUDACodeGen._in_device_code: # GPU kernels cannot access state + elif not CUDACodeGen._in_device_code.get(): # GPU kernels cannot access state return f'__state->__{sdfg.cfg_id}_{name}' elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: return f'__{sdfg.cfg_id}_{name}' diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 7ea1b582a6..cbff4954af 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1,4 +1,5 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import contextvars import ctypes import functools import warnings @@ -60,7 +61,7 @@ class CUDACodeGen(TargetCodeGenerator): """ GPU (CUDA/HIP) code generator. 
""" target_name = 'cuda' title = 'CUDA' - _in_device_code = False + _in_device_code = contextvars.ContextVar('_in_device_code') def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen @@ -70,7 +71,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self.create_grid_barrier = False self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.set(False) self._cpu_codegen: Optional['CPUCodeGen'] = None self._block_dims = None self._grid_dims = None @@ -454,7 +455,7 @@ def node_dispatch_predicate(self, sdfg, state, node): if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes if node.schedule in dtypes.GPU_SCHEDULES: return True - if CUDACodeGen._in_device_code: + if CUDACodeGen._in_device_code.get(): return True return False @@ -921,7 +922,7 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St raise LookupError('Memlet does not point to any of the nodes') if (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode) - and not CUDACodeGen._in_device_code + and not CUDACodeGen._in_device_code.get() and (src_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned] or dst_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned]) and not (src_storage in cpu_storage_types and dst_storage in cpu_storage_types)): @@ -1284,7 +1285,7 @@ def generate_state(self, callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams - if CUDACodeGen._in_device_code: + if CUDACodeGen._in_device_code.get(): self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) else: # Active streams found. 
Generate state normally and sync with the @@ -1509,10 +1510,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub outer_name = cpp.ptr(node.data, desc, nsdfg, self._frame) # Create name from within kernel - oldval = CUDACodeGen._in_device_code - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_name = cpp.ptr(node.data, desc, nsdfg, self._frame) - CUDACodeGen._in_device_code = oldval + CUDACodeGen._in_device_code.reset(token) self.extra_nsdfg_args.append((desc.as_arg(name=''), inner_name, outer_name)) self._dispatcher.defined_vars.add(inner_name, @@ -1572,9 +1572,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if not defined_type: defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.reset(token) self._dispatcher.defined_vars.add(inner_ptrname, defined_type, @@ -1591,9 +1591,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.reset(token) self._dispatcher.defined_vars.add(inner_ptrname, defined_type, ctype, allow_shadowing=True) # Rename argument in kernel prototype as necessary @@ -2101,8 +2101,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) # Dispatch internal code - assert CUDACodeGen._in_device_code is False - CUDACodeGen._in_device_code = True + assert CUDACodeGen._in_device_code.get() is False + CUDACodeGen._in_device_code.set(True) self._kernel_map = node self._kernel_state = cfg.node(state_id) self._block_dims = block_dims @@ -2155,7 +2155,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._block_dims = None self._kernel_map = None self._kernel_state = None - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.set(False) self._grid_dims = None self.dynamic_tbmap_type = None @@ -2180,7 +2180,7 @@ def get_next_scope_entries(self, dfg, scope_entry): def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Sanity check - assert CUDACodeGen._in_device_code == True + assert CUDACodeGen._in_device_code.get() == True dfg = cfg.state(state_id) scope_entry = dfg_scope.source_nodes()[0] @@ -2610,14 +2610,14 @@ def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return - if not CUDACodeGen._in_device_code: + if not CUDACodeGen._in_device_code.get(): self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return if isinstance(node, nodes.ExitNode): self._locals.clear_scope(self._code_state.indentation + 1) - if CUDACodeGen._in_device_code and isinstance(node, nodes.MapExit): + if 
CUDACodeGen._in_device_code.get() and isinstance(node, nodes.MapExit): return # skip self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -2739,7 +2739,7 @@ def _generate_condition_from_location(self, name: str, index_expr: str, node: no def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: generated_preamble_scopes = 0 - if self._in_device_code: + if self._in_device_code.get(): # If location dictionary prescribes that the code should run on a certain group of threads/blocks, # add condition generated_preamble_scopes += self._generate_condition_from_location('gpu_thread', self._get_thread_id(), From 23466cd6b80b120e7cdcbb4d11e3766f305d0c2c Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 7 May 2025 14:16:59 +0200 Subject: [PATCH 4/4] Added compilation flag for line info and removed fast_math flag from CUDA compilation flags --- dace/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index b5a7914018..189931ff3a 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -303,7 +303,7 @@ required: type: str title: nvcc Arguments description: Compiler argument flags for CUDA - default: '-Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter' + default: '--generate-line-info -Xcompiler -march=native -Xcompiler -Wno-unused-parameter' default_Windows: '-O3 --use_fast_math' hip_args:
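
A minimal sketch of the standard-library `contextvars` token pattern that [PATCH 3/4] adopts for `_in_device_code` (plain Python, independent of DaCe):

```python
import contextvars

# A context-local flag; each execution context sees its own value.
_in_device_code = contextvars.ContextVar('_in_device_code')
_in_device_code.set(False)

# set() returns a token that undoes exactly this assignment, so nested or
# concurrent uses restore the previous value instead of a hard-coded one.
token = _in_device_code.set(True)
try:
    assert _in_device_code.get() is True
finally:
    _in_device_code.reset(token)
assert _in_device_code.get() is False
```

Compared with the old class attribute plus manual save/restore of `oldval`, the token makes the restore exception-safe and keeps the flag local to the running context.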