From 0dc9af988404a2f5b41bdd8cba3255d57091cc8d Mon Sep 17 00:00:00 2001
From: Philip Mueller
Date: Thu, 1 May 2025 12:12:41 +0200
Subject: [PATCH 1/4] Squashed commit of the following:

commit 635da6c4e64fc983397eb2f90af4f102eb92b286
Author: Ioannis Magkanaris
Date: Fri Apr 25 15:40:25 2025 +0200

    Add NVTX range in CUDA GPU kernel call of program

---
 dace/codegen/CMakeLists.txt       | 2 +-
 dace/codegen/targets/framecode.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt
index 5482d4d30d..7d1ca4d714 100644
--- a/dace/codegen/CMakeLists.txt
+++ b/dace/codegen/CMakeLists.txt
@@ -141,7 +141,7 @@ if(DACE_ENABLE_CUDA)
   set(CMAKE_CUDA_ARCHITECTURES "${LOCAL_CUDA_ARCHITECTURES}")
   enable_language(CUDA)

-  list(APPEND DACE_LIBS CUDA::cudart)
+  list(APPEND DACE_LIBS CUDA::cudart CUDA::nvtx3)
   add_definitions(-DWITH_CUDA)

   if (MSVC_IDE)
diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index 33bc562f73..7a62170c3d 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -140,6 +140,8 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend:
         if backend == 'frame':
             global_stream.write('#include "../../include/hash.h"\n', sdfg)

+        global_stream.write('#ifdef WITH_CUDA\n#include <nvtx3/nvToolsExt.h>\n#endif\n', sdfg)
+
         #########################################################
         # Environment-based includes
         for env in self.environments:
@@ -266,7 +268,13 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre
             f'''
DACE_EXPORTED void __program_{fname}({mangle_dace_state_struct_name(fname)} *__state{params_comma})
{{
+    #ifdef WITH_CUDA
+    nvtxRangePushA("{fname}");
+    #endif
    __program_{fname}_internal(__state{paramnames_comma});
+    #ifdef WITH_CUDA
+    nvtxRangePop();
+    #endif
}}''', sdfg)

         for target in self._dispatcher.used_targets:

From d495495ce44773358366c82f48aa39fe23391bd9 Mon Sep 17 00:00:00 2001
From: Philip Mueller
Date: Thu, 1 May 2025 12:13:11 +0200
Subject: [PATCH 2/4] Squashed commit of the following:

commit aef99452609f49f3d0d00f911a4b89f9a25f191f
Author: Philip Mueller
Date: Tue Mar 25 06:52:35 2025 +0100

    As an experiment, removed some code I think is useless; let's see what the tests say.

commit e5bf87f0ec0252857702d8ffded92dd87d2040b1
Author: Philip Mueller
Date: Mon Mar 24 07:05:18 2025 +0100

    Added a comment to address the possible issues with views.

commit ba978742791c274e4f7bd79c506e7c6b392034e6
Merge: b0b994551 4245396e4
Author: Philip Mueller
Date: Fri Mar 21 16:07:12 2025 +0100

    Merge remote-tracking branch 'spcl/main' into improved-2d-copy

commit b0b994551bab28b1a7ca32d072a6d983aac71277
Author: Philip Mueller
Date: Fri Mar 21 16:06:09 2025 +0100

    Added Alexnicks's suggestions.

commit 065e0d7456f74e1f8b9c420fb20325dc4f1fa490
Author: Philip Mueller
Date: Mon Mar 17 08:56:39 2025 +0100

    Added tests to ensure that the new verification works as expected.

commit 51182e58da7773dafd62694935df281ff1ee083a
Author: Philip Mueller
Date: Mon Mar 17 08:14:14 2025 +0100

    Moved the test for negative sized subsets from the Memlet to the `validate_state()` function.
    The reason is that in some cases such a subset is valid, for example if an edge connects an
    AccessNode and a MapEntry, because in that case the map might not be executed. Since the
    Memlet does not have access to its source and destination nodes, it cannot check that, so
    the test was moved to a location that can do this check.
    However, it only does the check for AN to AN connections, which is a bit restrictive,
    but this is something for later.

commit 2801967a71cb71a902e30973fd19acf5c566e423
Author: Philip Mueller
Date: Mon Mar 17 07:41:51 2025 +0100

    I am not sure why the printout of the edge is not correct, but it is not where I thought I found it.

commit 3166302f30240db6332c645eb40903d1dea78938
Author: Philip Mueller
Date: Sat Mar 15 08:23:21 2025 +0100

    Fixed some issues and made it more logical.

commit 02d87b55476569f839420a1c7b62dc3e364b9173
Merge: 801adb105 d13079215
Author: Philip Mueller
Date: Sat Mar 15 08:18:08 2025 +0100

    Merge remote-tracking branch 'spcl/main' into improved-2d-copy

commit 801adb105b0f758b727b50ddf425ec3158a8f4da
Author: Philip Mueller
Date: Sat Mar 15 08:15:45 2025 +0100

    Added more verification.

commit 66b43f869b929ac969c75206fb2ce58a9d706059
Author: Philip Mueller
Date: Sat Mar 15 07:57:03 2025 +0100

    Simplified some checks.

commit 76a1a58e5c61bfc09f97fbb56d6aff0d20d8f94f
Author: Philip Mueller
Date: Fri Mar 14 15:20:49 2025 +0100

    Added a new test for the pseudo 1D case, i.e. when we reduce a 2D copy to a 1D copy
    because it happens to be contiguously allocated.

commit 0b15a7407f1789e4fd9500650187e858ba394fe7
Author: Philip Mueller
Date: Fri Mar 14 14:59:03 2025 +0100

    Added a note about wrong usage of eid in validation.

commit 322ecda119300f7402e53b81e7a340dbd3ba5aed
Author: Philip Mueller
Date: Fri Mar 14 14:52:28 2025 +0100

    Improved memlet checking.

commit 61ea7a6874d7c1b884b918413bbed84ca6b5259d
Author: Philip Mueller
Date: Fri Mar 14 14:43:27 2025 +0100

    Added a new test to the SDFG.

commit a67ad2a6d415d8f3431b22bb918d8ee3fba78f21
Author: Philip Mueller
Date: Fri Mar 14 14:36:02 2025 +0100

    Added also a test for the strided 1D copy.

commit c931b9193c86339048c65cbbe69887b9b33aba9e
Author: Philip Mueller
Date: Fri Mar 14 14:07:16 2025 +0100

    Now 2D copies work; more tests needed.

commit d0a396f929276be5a5994c8afb4dd1b465e11983
Author: Philip Mueller
Date: Fri Mar 14 13:42:18 2025 +0100

    Updated the memlet copying; I think I now have all the cases. I will now write the tests.

commit 9b49c9e36c45a5b295c9042031e379724342ebda
Author: Philip Mueller
Date: Fri Mar 14 11:16:12 2025 +0100

    Made a first version of the new copy implementation. But I have to run the unit tests.

commit feea97f46221773479bc8b8e73801e8fbe49abbe
Author: Philip Mueller
Date: Fri Mar 14 10:18:08 2025 +0100

    Started with the implementation of a better copy, but I have to fix it more.

---
 dace/codegen/targets/cuda.py              | 148 ++++++----
 dace/data.py                              |  16 +-
 dace/memlet.py                            |   3 +
 dace/sdfg/validation.py                   |  24 +-
 tests/codegen/cuda_memcopy_test.py        | 323 ++++++++++++++++++++++
 tests/sdfg/validation/subset_size_test.py |  83 ++++++
 6 files changed, 539 insertions(+), 58 deletions(-)
 create mode 100644 tests/sdfg/validation/subset_size_test.py

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index aaba068da3..7ea1b582a6 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -161,35 +161,40 @@ def preprocess(self, sdfg: SDFG) -> None:
                     nsdfg = state.parent
                     if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
                             and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):
+
+                        # NOTE: If possible `memlet_copy_to_absolute_strides()` will collapse an
+                        #   ND copy into a 1D copy if the memory is contiguous. In that case
+                        #   `copy_shape` will only have one element.
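+                        #   (Illustration, not from the original patch: copying the full range
+                        #   between two C-order (20, 30) arrays would give copy_shape == [600]
+                        #   with unit strides, while a strided subset keeps both entries of
+                        #   copy_shape.)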
                         copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
                             None, nsdfg, state, e, e.src, e.dst)
                         dims = len(copy_shape)

                         # Skip supported copy types
                         if dims == 1:
+                            # NOTE: We do not check if the stride is `1`. See `_emit_copy()` for more details.
                             continue
                         elif dims == 2:
-                            if src_strides[-1] != 1 or dst_strides[-1] != 1:
-                                # NOTE: Special case of continuous copy
-                                # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                                # with copy shape [I, J] and strides [J*K, K], [J, 1]
-                                try:
-                                    is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                                    is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
-                                except (TypeError, ValueError):
-                                    is_src_cont = False
-                                    is_dst_cont = False
-                                if is_src_cont and is_dst_cont:
-                                    continue
-                                else:
+                            # Because `memlet_copy_to_absolute_strides()` handles contiguous copies
+                            # transparently, we only have to check if we have FORTRAN or C order.
+                            # If we have neither, we have to turn this copy into a Map.
+                            is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+                            is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1
+                            if is_c_order or is_fortran_order:
                                 continue
                         elif dims > 2:
-                            if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+                            # Any higher-dimensional copy must be in C order. If it is not,
+                            # turn it into a copy Map.
+                            if src_strides[-1] == 1 and dst_strides[-1] == 1:
                                 continue

                         # Turn unsupported copy to a map
                         try:
-                            CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
+                            CopyToMap.apply_to(nsdfg,
+                                               save=False,
+                                               annotate=False,
+                                               a=e.src,
+                                               b=e.dst,
+                                               options={"ignore_strides": True})
                         except ValueError:  # If transformation doesn't match, continue normally
                             continue
@@ -973,32 +978,21 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
             copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                 self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types))
             dims = len(copy_shape)
-            dtype = dst_node.desc(sdfg).dtype

-            # Handle unsupported copy types
-            if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1):
-                # NOTE: Special case of continuous copy
-                # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                # with copy shape [I, J] and strides [J*K, K], [J, 1]
-                try:
-                    is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                    is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
-                except (TypeError, ValueError):
-                    is_src_cont = False
-                    is_dst_cont = False
-                if is_src_cont and is_dst_cont:
-                    dims = 1
-                    copy_shape = [copy_shape[0] * copy_shape[1]]
-                    src_strides = [src_strides[1]]
-                    dst_strides = [dst_strides[1]]
-                else:
-                    raise NotImplementedError('2D copy only supported with one stride')
+            # In 1D there is no difference between FORTRAN and C order, thus we set them
+            # to the same value. The value indicates whether the stride is `1`.
+            # TODO: Figure out if this is enough for views.
+            is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+            is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1

-            # Currently we only support ND copies when they can be represented
-            # as a 1D copy or as a 2D strided copy
             if dims > 2:
-                if src_strides[-1] != 1 or dst_strides[-1] != 1:
+                # Currently we only support ND copies when they can be represented
+                # as a 1D copy or as a 2D strided copy.
+                # NOTE: Not sure if this test is enough; it should also be checked that
+                #   the strides are ordered, i.e. largest stride on the left.
+                if not is_c_order:
+                    # TODO: Implement the FORTRAN case.
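+                    # (A possible approach, not part of this patch: reversing `copy_shape`,
+                    #  `src_strides` and `dst_strides` would reduce a FORTRAN-order ND copy
+                    #  to the C-order case checked above.)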
                    raise NotImplementedError(
                        'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n'
                        f'  Nodes: src {src_node} ({src_storage}), dst {dst_node} ({dst_storage})\n'
@@ -1026,7 +1020,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                for d in range(dims - 2):
                    callsite_stream.write("}")

-            if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+            elif dims == 1 and is_c_order:
+                # A 1D copy whose stride is known to be 1 at code generation time.
                copysize = ' * '.join(_topy(copy_shape))
                array_length = copysize
                copysize += ' * sizeof(%s)' % dtype.ctype
@@ -1064,22 +1059,70 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                                     backend=self.backend), cfg, state_id, [src_node, dst_node])
                    callsite_stream.write('}')
-            elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)):
+
+            elif dims == 1 and not is_c_order:
+                # This is the case generated for expressions such as `A[::3]`; we reduce it
+                # to a 2D copy.
+                callsite_stream.write(
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+
+            elif dims == 2 and is_c_order:
+                # Copying a 2D array that is in C order, i.e. the last stride is 1.
                callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
-            elif dims == 2:
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            elif dims == 2 and is_fortran_order:
+                # Copying a 2D array into a 2D array that is in FORTRAN order, i.e. the first
+                # stride is one. The CUDA API cannot handle such cases directly; however, by
+                # "transposing" it is possible to use `Memcpy2DAsync`.
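+                # (Illustration, not from the original patch: for FORTRAN-order arrays with
+                #  strides (1, 20), the call below swaps the roles of the two dimensions,
+                #  i.e. the pitch comes from strides[1], the width in bytes from copy_shape[0],
+                #  and the height in rows from copy_shape[1].)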
                callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[1])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[1]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            else:
+                raise NotImplementedError("The requested copy operation is not implemented.")

            # Post-copy synchronization
            if is_sync:
@@ -1126,7 +1169,6 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
            # Obtain copy information
            copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types))
-            dims = len(copy_shape)

            funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims)
diff --git a/dace/data.py b/dace/data.py
index 74c1e8b985..3279aff63b 100644
--- a/dace/data.py
+++ b/dace/data.py
@@ -210,6 +210,8 @@ def _validate(self):
        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape):
            raise TypeError('Shape must be a list or tuple of integer values '
                            'or symbols')
+        if any((shp < 0) == True for shp in self.shape):
+            raise TypeError(f'Found negative shape in Data, its shape was {self.shape}')
        return True

    def to_json(self):
@@ -1471,12 +1473,20 @@ def validate(self):
        super(Array, self).validate()
        if len(self.strides) != len(self.shape):
            raise TypeError('Strides must be the same size as shape')
+        if len(self.offset) != len(self.shape):
+            raise TypeError('Offset must be the same size as shape')

        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
               for s in self.strides):
            raise TypeError('Strides must be a list or tuple of integer values or symbols')
-
-        if len(self.offset) != len(self.shape):
-            raise TypeError('Offset must be the same size as shape')
+        if any(not isinstance(off, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
+               for off in self.offset):
+            raise TypeError('Offset must be a list or tuple of integer values or symbols')
+
+        # Actually it would be enough to enforce non-negativity only if the shape is larger than one.
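+        # (`(stride < 0) == True` is used instead of `stride < 0` because for symbolic
+        #  values the comparison may stay unevaluated; only a definite `True` raises.)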
+        if any((stride < 0) == True for stride in self.strides):
+            raise TypeError(f'Found negative strides in array, they were {self.strides}')
+        if (self.total_size < 0) == True:
+            raise TypeError(f'The total size of an array must not be negative, but it was {self.total_size}')

    def covers_range(self, rng):
        if len(rng) != len(self.shape):
diff --git a/dace/memlet.py b/dace/memlet.py
index 46dac51edf..090a7890fa 100644
--- a/dace/memlet.py
+++ b/dace/memlet.py
@@ -534,6 +534,9 @@ def dst_subset(self, new_dst_subset):
    def validate(self, sdfg, state):
        if self.data is not None and self.data not in sdfg.arrays:
            raise KeyError('Array "%s" not found in SDFG' % self.data)
+        # NOTE: We do not check here if the subsets have a negative size, because such a
+        #   subset is valid in certain cases, for example if an AccessNode is connected to a
+        #   MapEntry, because the Map is then not executed. Thus we do the check in the
+        #   `validate_state()` function.

    def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]:
        """
diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py
index ccfb0adada..f501697b57 100644
--- a/dace/sdfg/validation.py
+++ b/dace/sdfg/validation.py
@@ -9,7 +9,7 @@
 import networkx as nx

-from dace import dtypes, subsets, symbolic
+from dace import dtypes, subsets, symbolic, data
 from dace.dtypes import DebugInfo

 if TYPE_CHECKING:
@@ -656,7 +656,6 @@ def validate_state(state: 'dace.sdfg.SDFGState',
            )

    ########################################
-    # Memlet checks
    for eid, e in enumerate(state.edges()):
        # Reference check
        if id(e) in references:
@@ -680,6 +679,27 @@ def validate_state(state: 'dace.sdfg.SDFGState',
        except Exception as ex:
            raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid)

+        # If the edge is a connection between two AccessNodes, check if the subset has a negative size.
+        # NOTE: We _should_ do this check in `Memlet.validate()`; however, this is not possible,
+        #   because a connection between an AccessNode and a MapEntry with a negative size is
+        #   legal, since the Map will not run in that case. This constellation cannot be tested
+        #   for in the Memlet's validation function, so we have to do it here.
+        # NOTE: Zero size is explicitly allowed because it is essentially `memcpy(dst, src, 0)`,
+        #   which is safe.
+        # TODO: The AN to AN connection is the most obvious case, but the check should be extended.
+        if isinstance(e.src, nd.AccessNode) and isinstance(e.dst, nd.AccessNode):
+            e_memlet: dace.Memlet = e.data
+            if e_memlet.subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.subset.size()}',
+                        sdfg, state_id, eid)
+            if e_memlet.other_subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.other_subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`other_subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.other_subset.size()}',
+                        sdfg, state_id, eid)
+
        # For every memlet, obtain its full path in the DFG
        path = state.memlet_path(e)
        src_node = path[0].src
diff --git a/tests/codegen/cuda_memcopy_test.py b/tests/codegen/cuda_memcopy_test.py
index 36c5d19f7a..34853f0adb 100644
--- a/tests/codegen/cuda_memcopy_test.py
+++ b/tests/codegen/cuda_memcopy_test.py
@@ -1,8 +1,10 @@
 """ Tests code generation for array copy on GPU target.
""" import dace from dace.transformation.auto import auto_optimize +from dace.sdfg import nodes as dace_nodes import pytest +import copy import re # this test requires cupy module @@ -12,6 +14,237 @@ rng = cp.random.default_rng(42) +def count_node(sdfg: dace.SDFG, node_type): + nb_nodes = 0 + for rsdfg in sdfg.all_sdfgs_recursive(): + for state in sdfg.states(): + for node in state.nodes(): + if isinstance(node, node_type): + nb_nodes += 1 + return nb_nodes + + +def _make_2d_gpu_copy_sdfg(c_order: bool, ) -> dace.SDFG: + """The SDFG performs a copy from the input of the output, that is continuous. + + Essentially the function will generate am SDFG that performs the following + operation: + ```python + B[2:7, 3:9] = A[1:6, 2:8] + ``` + However, two arrays have a shape of `(20, 30)`. This means that this copy + can not be expressed as a continuous copy. Regardless which memory order + that is used, which can be selected by `c_order`. + """ + sdfg = dace.SDFG(f'gpu_2d_copy_{"corder" if c_order else "forder"}_copy_sdfg') + state = sdfg.add_state(is_start_block=True) + + for aname in 'AB': + sdfg.add_array( + name=aname, + shape=(20, 30), + dtype=dace.float64, + storage=dace.StorageType.GPU_Global, + transient=False, + strides=((30, 1) if c_order else (1, 20)), + ) + + state.add_nedge( + state.add_access("A"), + state.add_access("B"), + dace.Memlet("A[1:6, 2:8] -> [2:7, 3:9]"), + ) + sdfg.validate() + + return sdfg + + +def _perform_2d_gpu_copy_test(c_order: bool, ): + """Check 2D strided copies are handled by the `Memcpy2D` family. + """ + sdfg = _make_2d_gpu_copy_sdfg(c_order=c_order) + assert count_node(sdfg, dace_nodes.AccessNode) == 2 + assert count_node(sdfg, dace_nodes.MapEntry) == 0 + + # Now generate the code. + csdfg = sdfg.compile() + + # Ensure that the copy was not turned into a Map + assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2 + assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0 + + # Ensure that the correct call was issued. + # We have to look at the CPU code and not at the GPU. + code = sdfg.generate_code()[0].clean_code + m = re.search(r'(cuda|hip)Memcpy2DAsync\b', code) + assert m is not None + + # Generate input data. + ref = { + "A": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"), + "B": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"), + } + + # We can not use `deepcopy` or `.copy()` because this would set the strides to `C` order. 
+    res = {}
+    for name in ref.keys():
+        res[name] = cp.empty_like(ref[name])
+        res[name][:] = ref[name][:]
+
+    exp_strides = (240, 8) if c_order else (8, 160)
+    assert all(v.strides == exp_strides for v in ref.values())
+    assert all(v.strides == exp_strides for v in res.values())
+
+    # Now apply the operation on the reference.
+    ref["B"][2:7, 3:9] = ref["A"][1:6, 2:8]
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def _make_1d_gpu_copy(
+    src_row: bool,
+    dst_row: bool,
+) -> dace.SDFG:
+    sdfg = dace.SDFG(f'gpu_1d_copy_{"row" if src_row else "col"}_{"row" if dst_row else "col"}_copy_sdfg')
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, 20),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+        )
+
+    src_subset = "1, 1:9" if src_row else "1:9, 2"
+    dst_subset = "3, 0:8" if dst_row else "0:8, 4"
+
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet(f"A[{src_subset}] -> [{dst_subset}]"),
+    )
+    sdfg.validate()
+    return sdfg
+
+
+def _perform_1d_gpu_copy(
+    src_row: bool,
+    dst_row: bool,
+):
+    sdfg = _make_1d_gpu_copy(src_row=src_row, dst_row=dst_row)
+    assert count_node(sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(sdfg, dace_nodes.MapEntry) == 0
+
+    # Now generate the code.
+    csdfg = sdfg.compile()
+
+    # Ensure that the copy was not turned into a Map.
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    # It will always result in a call to `Memcpy2D`, except when both the source and
+    # the destination operate on rows; then it is a simple 1D copy.
+    code = sdfg.generate_code()[0].clean_code
+    if src_row and dst_row:
+        m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    else:
+        m = re.search(r'(cuda|hip)Memcpy2DAsync\b', code)
+    assert m is not None
+
+    # Generate input data.
+    ref = {
+        "A": cp.array(cp.random.rand(20, 20), dtype=cp.float64, order="C"),
+        "B": cp.array(cp.random.rand(20, 20), dtype=cp.float64, order="C"),
+    }
+    res = {k: v.copy() for k, v in ref.items()}
+
+    # Now perform the reference operation.
+    src_subset = ref["A"][1, 1:9] if src_row else ref["A"][1:9, 2]
+    if dst_row:
+        ref["B"][3, 0:8] = src_subset
+    else:
+        ref["B"][0:8, 4] = src_subset
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def _make_pseudo_1d_copy_sdfg(c_order: bool, ) -> dace.SDFG:
+    """An SDFG that performs a 2D copy that can be turned into a 1D copy.
+    """
+    sdfg = dace.SDFG(f'gpu_pseudo_1d_copy_{"corder" if c_order else "forder"}_sdfg')
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, 30),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+            strides=((30, 1) if c_order else (1, 20)),
+        )
+
+    cpy_subset = "1:18, 0:30" if c_order else "0:20, 2:29"
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet(f"A[{cpy_subset}] -> [{cpy_subset}]"),
+    )
+    sdfg.validate()
+
+    return sdfg
+
+
+def _perform_pseudo_1d_copy_test(c_order: bool):
+    sdfg = _make_pseudo_1d_copy_sdfg(c_order=c_order)
+    assert count_node(sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(sdfg, dace_nodes.MapEntry) == 0
+
+    # Now generate the code.
+    csdfg = sdfg.compile()
+
+    # Ensure that the copy was not turned into a Map.
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    code = sdfg.generate_code()[0].clean_code
+    m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    assert m is not None
+
+    # Generate input data.
+    ref = {
+        "A": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"),
+        "B": cp.array(cp.random.rand(20, 30), dtype=cp.float64, order="C" if c_order else "F"),
+    }
+
+    # We cannot use `deepcopy` or `.copy()` because this would set the strides to `C` order.
+    res = {}
+    for name in ref.keys():
+        res[name] = cp.empty_like(ref[name])
+        res[name][:] = ref[name][:]
+
+    # Perform the reference computation.
+    if c_order:
+        ref["B"][1:18, 0:30] = ref["A"][1:18, 0:30]
+    else:
+        ref["B"][0:20, 2:29] = ref["A"][0:20, 2:29]
+
+    # Now run the SDFG.
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
 @pytest.mark.gpu
 def test_gpu_shared_to_global_1D():
     M = 32
@@ -88,6 +321,96 @@ def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N,
     assert m is not None


+@pytest.mark.gpu
+def test_gpu_1d_copy():
+    sdfg = dace.SDFG("gpu_1d_copy_sdfg")
+    state = sdfg.add_state(is_start_block=True)
+
+    for aname in 'AB':
+        sdfg.add_array(
+            name=aname,
+            shape=(20, ),
+            dtype=dace.float64,
+            storage=dace.StorageType.GPU_Global,
+            transient=False,
+        )
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet("A[2:13] -> [1:12]"),
+    )
+    sdfg.validate()
+
+    csdfg = sdfg.compile()
+    assert count_node(csdfg.sdfg, dace_nodes.AccessNode) == 2
+    assert count_node(csdfg.sdfg, dace_nodes.MapEntry) == 0
+
+    code = sdfg.generate_code()[0].clean_code
+    m = re.search(r'(cuda|hip)MemcpyAsync\b', code)
+    assert m is not None
+
+    # Now run the SDFG.
+    ref = {
+        "A": cp.array(cp.random.rand(20), dtype=cp.float64),
+        "B": cp.array(cp.random.rand(20), dtype=cp.float64),
+    }
+    res = {k: v.copy() for k, v in ref.items()}
+
+    ref["B"][1:12] = ref["A"][2:13]
+    csdfg(**res)
+
+    assert all(cp.all(ref[k] == res[k]) for k in ref.keys())
+
+
+@pytest.mark.gpu
+def test_2d_c_order_gpu_copy():
+    _perform_2d_gpu_copy_test(c_order=True)
+
+
+@pytest.mark.gpu
+def test_2d_f_order_gpu_copy():
+    _perform_2d_gpu_copy_test(c_order=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_row_row():
+    _perform_1d_gpu_copy(src_row=True, dst_row=True)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_row_col():
+    _perform_1d_gpu_copy(src_row=True, dst_row=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_col_col():
+    _perform_1d_gpu_copy(src_row=False, dst_row=False)
+
+
+@pytest.mark.gpu
+def test_gpu_1d_copy_col_row():
+    _perform_1d_gpu_copy(src_row=False, dst_row=True)
+
+
+@pytest.mark.gpu
+def test_gpu_pseudo_1d_copy_c_order():
+    _perform_pseudo_1d_copy_test(c_order=True)
+
+
+@pytest.mark.gpu
+def test_gpu_pseudo_1d_copy_f_order():
+    _perform_pseudo_1d_copy_test(c_order=False)
+
+
 if __name__ == '__main__':
     test_gpu_shared_to_global_1D()
     test_gpu_shared_to_global_1D_accumulate()
+    test_2d_c_order_gpu_copy()
+    test_2d_f_order_gpu_copy()
+    test_gpu_1d_copy_row_row()
+    test_gpu_1d_copy_row_col()
+    test_gpu_1d_copy_col_row()
+    test_gpu_1d_copy_col_col()
+    test_gpu_1d_copy()
+    test_gpu_pseudo_1d_copy_c_order()
+    test_gpu_pseudo_1d_copy_f_order()
diff --git a/tests/sdfg/validation/subset_size_test.py b/tests/sdfg/validation/subset_size_test.py
new file mode 100644
index 0000000000..bc01b85a12
--- /dev/null
+++ b/tests/sdfg/validation/subset_size_test.py
@@ -0,0 +1,83 @@
+from typing import Tuple
+
+import dace
+
+import re
+import pytest
+import numpy as np
+
+
+def _make_sdfg_with_zero_sized_an_to_an_memlet() -> Tuple[dace.SDFG, dace.SDFGState]:
+    """Generates an SDFG that performs a copy that has a zero size.
+    """
+    sdfg = dace.SDFG("zero_size_copy_sdfg")
+    state = sdfg.add_state(is_start_block=True)
+
+    for name in "AB":
+        sdfg.add_array(
+            name=name,
+            shape=(20, 20),
+            dtype=dace.float64,
+            transient=True,
+        )
+
+    state.add_nedge(
+        state.add_access("A"),
+        state.add_access("B"),
+        dace.Memlet("A[2:17, 2:2] -> [2:18, 3:3]"),
+    )
+
+    return sdfg, state
+
+
+def test_an_to_an_memlet_with_zero_size():
+    sdfg, state = _make_sdfg_with_zero_sized_an_to_an_memlet()
+    assert sdfg.number_of_nodes() == 1
+    assert state.number_of_nodes() == 2
+
+    sdfg.validate()
+
+    # This zero sized copy should be considered valid.
+    assert sdfg.is_valid()
+
+    # The SDFG should be a no-op.
+    ref = {
+        "A": np.array(np.random.rand(20, 20), copy=True, order="C", dtype=np.float64),
+        "B": np.array(np.random.rand(20, 20), copy=True, order="C", dtype=np.float64),
+    }
+    res = {k: np.array(v, order="C", copy=True) for k, v in ref.items()}
+
+    csdfg = sdfg.compile()
+    assert csdfg.sdfg.number_of_nodes() == 1
+    assert csdfg.sdfg.states()[0].number_of_nodes() == 2
+    csdfg(**res)
+
+    assert all(np.all(ref[k] == res[k]) for k in ref.keys())
+
+
+def test_an_to_an_memlet_with_negative_size():
+    """Tests that an AccessNode to AccessNode Memlet with a negative sized subset leads to an invalid SDFG.
+ """ + sdfg = dace.SDFG("an_to_an_memlet_with_negative_size") + state = sdfg.add_state(is_start_block=True) + + for name in "AB": + sdfg.add_array( + name=name, + shape=(20, 20), + dtype=dace.float64, + transient=True, + ) + + state.add_nedge( + state.add_access("A"), + state.add_access("B"), + dace.Memlet("A[2:17, 13:2] -> [2:18, 14:3]"), + ) + + with pytest.raises( + expected_exception=dace.sdfg.InvalidSDFGEdgeError, + match=re.escape( + f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was [15, -11]'), + ): + sdfg.validate() From 8b9b674abfcbd0148a32a87fa6d88e240030435d Mon Sep 17 00:00:00 2001 From: Philip Mueller Date: Thu, 1 May 2025 12:14:03 +0200 Subject: [PATCH 3/4] Squashed commit of the following: commit 5197dcf3b0604de750a018204e4a8c9559633362 Author: Edoardo Paone Date: Wed Apr 30 07:43:10 2025 +0200 fix in cpp codegen commit c01760ca6d13769e1ce7ea9defc1ef7098e89e1b Author: Edoardo Paone Date: Wed Apr 30 07:42:53 2025 +0200 Revert "add default - needed" This reverts commit 47d65ac573f210baf5ccaad5a17c85f9c7480ed6. commit 580f743a2aec8cad4aae24d89768717d16bb7bb1 Author: Edoardo Paone Date: Wed Apr 30 07:42:05 2025 +0200 Revert "edit" This reverts commit a7138d252115a0bf313d212e086d387214dadbc3. commit a7138d252115a0bf313d212e086d387214dadbc3 Author: Edoardo Paone Date: Wed Apr 30 07:26:14 2025 +0200 edit commit 47d65ac573f210baf5ccaad5a17c85f9c7480ed6 Author: Edoardo Paone Date: Tue Apr 29 23:27:55 2025 +0200 add default - needed commit a10b5b3d01c7072dddbff02e1569e75630efae17 Author: Edoardo Paone Date: Tue Apr 29 23:07:09 2025 +0200 minor edit commit 40395a8c46e0ebf42d0180a88f5471ba1924122c Author: Edoardo Paone Date: Tue Apr 29 22:42:22 2025 +0200 use ContextVar for _in_device_code --- dace/codegen/targets/cpp.py | 2 +- dace/codegen/targets/cuda.py | 38 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 5c4d04c0a7..38e16e72d6 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -257,7 +257,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' - elif not CUDACodeGen._in_device_code: # GPU kernels cannot access state + elif not CUDACodeGen._in_device_code.get(): # GPU kernels cannot access state return f'__state->__{sdfg.cfg_id}_{name}' elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: return f'__{sdfg.cfg_id}_{name}' diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 7ea1b582a6..cbff4954af 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1,4 +1,5 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import contextvars import ctypes import functools import warnings @@ -60,7 +61,7 @@ class CUDACodeGen(TargetCodeGenerator): """ GPU (CUDA/HIP) code generator. 
""" target_name = 'cuda' title = 'CUDA' - _in_device_code = False + _in_device_code = contextvars.ContextVar('_in_device_code') def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen @@ -70,7 +71,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self.create_grid_barrier = False self.dynamic_tbmap_type = None self.extra_nsdfg_args = [] - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.set(False) self._cpu_codegen: Optional['CPUCodeGen'] = None self._block_dims = None self._grid_dims = None @@ -454,7 +455,7 @@ def node_dispatch_predicate(self, sdfg, state, node): if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes if node.schedule in dtypes.GPU_SCHEDULES: return True - if CUDACodeGen._in_device_code: + if CUDACodeGen._in_device_code.get(): return True return False @@ -921,7 +922,7 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St raise LookupError('Memlet does not point to any of the nodes') if (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode) - and not CUDACodeGen._in_device_code + and not CUDACodeGen._in_device_code.get() and (src_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned] or dst_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned]) and not (src_storage in cpu_storage_types and dst_storage in cpu_storage_types)): @@ -1284,7 +1285,7 @@ def generate_state(self, callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams - if CUDACodeGen._in_device_code: + if CUDACodeGen._in_device_code.get(): self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) else: # Active streams found. 
Generate state normally and sync with the @@ -1509,10 +1510,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub outer_name = cpp.ptr(node.data, desc, nsdfg, self._frame) # Create name from within kernel - oldval = CUDACodeGen._in_device_code - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_name = cpp.ptr(node.data, desc, nsdfg, self._frame) - CUDACodeGen._in_device_code = oldval + CUDACodeGen._in_device_code.reset(token) self.extra_nsdfg_args.append((desc.as_arg(name=''), inner_name, outer_name)) self._dispatcher.defined_vars.add(inner_name, @@ -1572,9 +1572,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if not defined_type: defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.reset(token) self._dispatcher.defined_vars.add(inner_ptrname, defined_type, @@ -1591,9 +1591,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) - CUDACodeGen._in_device_code = True + token = CUDACodeGen._in_device_code.set(True) inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.reset(token) self._dispatcher.defined_vars.add(inner_ptrname, defined_type, ctype, allow_shadowing=True) # Rename argument in kernel prototype as necessary @@ -2101,8 +2101,8 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) # Dispatch internal code - assert CUDACodeGen._in_device_code is False - CUDACodeGen._in_device_code = True + assert CUDACodeGen._in_device_code.get() is False + CUDACodeGen._in_device_code.set(True) self._kernel_map = node self._kernel_state = cfg.node(state_id) self._block_dims = block_dims @@ -2155,7 +2155,7 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S self._block_dims = None self._kernel_map = None self._kernel_state = None - CUDACodeGen._in_device_code = False + CUDACodeGen._in_device_code.set(False) self._grid_dims = None self.dynamic_tbmap_type = None @@ -2180,7 +2180,7 @@ def get_next_scope_entries(self, dfg, scope_entry): def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Sanity check - assert CUDACodeGen._in_device_code == True + assert CUDACodeGen._in_device_code.get() == True dfg = cfg.state(state_id) scope_entry = dfg_scope.source_nodes()[0] @@ -2610,14 +2610,14 @@ def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return - if not CUDACodeGen._in_device_code: + if not CUDACodeGen._in_device_code.get(): self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return if isinstance(node, nodes.ExitNode): self._locals.clear_scope(self._code_state.indentation + 1) - if CUDACodeGen._in_device_code and isinstance(node, nodes.MapExit): + if 
CUDACodeGen._in_device_code.get() and isinstance(node, nodes.MapExit): return # skip self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -2739,7 +2739,7 @@ def _generate_condition_from_location(self, name: str, index_expr: str, node: no def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: generated_preamble_scopes = 0 - if self._in_device_code: + if self._in_device_code.get(): # If location dictionary prescribes that the code should run on a certain group of threads/blocks, # add condition generated_preamble_scopes += self._generate_condition_from_location('gpu_thread', self._get_thread_id(), From 23466cd6b80b120e7cdcbb4d11e3766f305d0c2c Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 7 May 2025 14:16:59 +0200 Subject: [PATCH 4/4] Added compilation flag for line info and removed fast_math flag from CUDA compilation flags --- dace/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index b5a7914018..189931ff3a 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -303,7 +303,7 @@ required: type: str title: nvcc Arguments description: Compiler argument flags for CUDA - default: '-Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter' + default: '--generate-line-info -Xcompiler -march=native -Xcompiler -Wno-unused-parameter' default_Windows: '-O3 --use_fast_math' hip_args:
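
A minimal sketch of the standard-library `contextvars` token pattern that [PATCH 3/4] adopts for `_in_device_code` (plain Python, independent of DaCe):

```python
import contextvars

# A context-local flag; each execution context sees its own value.
_in_device_code = contextvars.ContextVar('_in_device_code')
_in_device_code.set(False)

# set() returns a token that undoes exactly this assignment, so nested or
# concurrent uses restore the previous value instead of a hard-coded one.
token = _in_device_code.set(True)
try:
    assert _in_device_code.get() is True
finally:
    _in_device_code.reset(token)
assert _in_device_code.get() is False
```

Compared with the old class attribute plus manual save/restore of `oldval`, the token makes the restore exception-safe and keeps the flag local to the running context.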