diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml deleted file mode 100644 index fe60e4867c..0000000000 --- a/.github/workflows/copilot-setup-steps.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: "Copilot Setup Steps" -on: workflow_dispatch - -jobs: - copilot-setup-steps: - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up C++ compiler - run: | - sudo apt-get update - sudo apt-get install -y g++ build-essential - - - name: Set up CMake - run: | - sudo apt-get install -y cmake - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - # Install additional testing dependencies - python -m pip install pytest pytest-cov - python -m pip install flake8 pytest-xdist coverage - - - name: Install DaCe in development mode - run: | - python -m pip install --editable ".[testing,linting]" - pre-commit install - pre-commit run diff --git a/.github/workflows/fpga-ci.yml b/.github/workflows/fpga-ci.yml deleted file mode 100644 index 926d4c69e9..0000000000 --- a/.github/workflows/fpga-ci.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: FPGA Tests - -on: - push: - branches: [ main, ci-fix ] - pull_request: - branches: [ main, ci-fix ] - merge_group: - branches: [ main, ci-fix ] - -env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - -concurrency: - group: ${{github.workflow}}-${{github.ref}} - cancel-in-progress: true - -jobs: - test-fpga: - if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-ci') }} - runs-on: [self-hosted, linux, intel-fpga, xilinx-fpga] - steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Install dependencies - run: | - rm -f ~/.dace.conf - rm -rf .dacecache tests/.dacecache - python -m venv ~/.venv # create venv so we can use pip - source ~/.venv/bin/activate # activate venv - python -m pip install --upgrade pip - pip install pytest-xdist flake8 coverage click - pip uninstall -y dace - pip install -e ".[testing]" - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - - name: Run FPGA Tests - run: | - source ~/.venv/bin/activate # activate venv - export COVERAGE_RCFILE=`pwd`/.coveragerc - - # Xilinx setup - export PATH=/opt/Xilinx/Vitis/2022.1/bin:/opt/Xilinx/Vitis_HLS/2022.1/bin:/opt/Xilinx/Vivado/2022.1/bin:$PATH - export XILINX_XRT=/opt/xilinx/xrt - export LD_LIBRARY_PATH=$XILINX_XRT/lib:$LD_LIBRARY_PATH - export XILINX_VITIS=/opt/Xilinx/Vitis/2022.1 - export DACE_compiler_xilinx_platform=xilinx_u250_gen3x16_xdma_4_1_202210_1 - - # Intel FPGA setup - export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld - export ALTERAOCLSDKROOT=$INTELFPGAOCLSDKROOT - export AOCL_BOARD_PACKAGE_ROOT=/opt/intelFPGA_pro/19.1/hld/board/a10_ref - export PATH=$INTELFPGAOCLSDKROOT/bin:$PATH - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AOCL_BOARD_PACKAGE_ROOT/linux64/lib - export QUARTUS_ROOTDIR_OVERRIDE=/opt/intelFPGA_pro/19.1/quartus - export LD_PRELOAD=/lib/x86_64-linux-gnu/libstdc++.so.6 # Work around dependency issues - - # Due to an internal bug in the Xilinx tools, where the current datetime is passed as an integer - # and overflowed in the year 2022, run the FPGA tests pretending like it's January 1st 2021. 
- # faketime -f "@2021-01-01 00:00:00" pytest -n auto --cov-report=xml --cov=dace --tb=short -m "fpga" - # Try running without faketime - pytest -n auto --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "fpga" - - coverage report - coverage xml - reachable=0 - ping -W 2 -c 1 codecov.io || reachable=$? - if [ $reachable -eq 0 ]; then - ./codecov - else - echo "Codecov.io is unreachable" - fi - killall -9 xsim xsimk || true diff --git a/.github/workflows/general-ci.yml b/.github/workflows/general-ci.yml deleted file mode 100644 index 1d9dc3fa79..0000000000 --- a/.github/workflows/general-ci.yml +++ /dev/null @@ -1,112 +0,0 @@ -name: General Tests - -on: - push: - branches: [ main, ci-fix ] - pull_request: - branches: [ main, ci-fix ] - merge_group: - branches: [ main, ci-fix ] - -concurrency: - group: ${{github.workflow}}-${{github.ref}} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.9','3.13'] - simplify: [0,1,autoopt] - - steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - # Make dependency setup faster - echo 'set man-db/auto-update false' | sudo debconf-communicate >/dev/null - sudo dpkg-reconfigure man-db - # Install dependencies - sudo apt-get update - sudo apt-get install -y libyaml-dev cmake libblas-dev libopenblas-dev liblapacke-dev libpapi-dev papi-tools - pip install flake8 pytest-xdist coverage - pip install -e ".[testing]" - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - - name: Test dependencies - run: | - papi_avail - - - name: Test with pytest - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=1 - export DACE_testing_deserialize_exception=1 - export DACE_cache=unique - if [ "${{ matrix.simplify }}" = "autoopt" ]; then - export DACE_optimizer_automatic_simplification=1 - export DACE_optimizer_autooptimize=1 - echo "Auto-optimization heuristics" - else - export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} - fi - pytest -n auto --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "not gpu and not verilator and not tensorflow and not mkl and not sve and not papi and not mlir and not lapack and not fpga and not mpi and not rtl_hardware and not scalapack and not datainstrument and not long and not sequential" - ./codecov - - - name: Test OpenBLAS LAPACK - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=1 - export DACE_testing_deserialize_exception=1 - export DACE_cache=unique - if [ "${{ matrix.simplify }}" = "autoopt" ]; then - export DACE_optimizer_automatic_simplification=1 - export DACE_optimizer_autooptimize=1 - echo "Auto-optimization heuristics" - else - export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} - fi - pytest -n 1 --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "lapack" - ./codecov - - - name: Run sequential tests - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=1 - export DACE_testing_deserialize_exception=1 - export DACE_cache=unique - if [ "${{ matrix.simplify }}" = "autoopt" ]; then - export DACE_optimizer_automatic_simplification=1 - export DACE_optimizer_autooptimize=1 - echo "Auto-optimization heuristics" - else - export 
DACE_optimizer_automatic_simplification=${{ matrix.simplify }} - fi - pytest -n 1 --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "sequential" - ./codecov - - - name: Run other tests - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=0 - export DACE_testing_deserialize_exception=1 - export DACE_cache=single - export DACE_optimizer_automatic_simplification=${{ matrix.simplify }} - export PYTHON_BINARY="coverage run --source=dace --parallel-mode" - ./tests/polybench_test.sh - ./tests/xform_test.sh - coverage combine .; coverage report; coverage xml - - - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - verbose: true diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml deleted file mode 100644 index d6064cb542..0000000000 --- a/.github/workflows/gpu-ci.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: GPU Tests - -on: - push: - branches: [ main, ci-fix ] - pull_request: - branches: [ main, ci-fix ] - merge_group: - branches: [ main, ci-fix ] - -env: - CUDACXX: /usr/local/cuda/bin/nvcc - MKLROOT: /opt/intel/oneapi/mkl/latest/ - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - -concurrency: - group: ${{github.workflow}}-${{github.ref}} - cancel-in-progress: true - -jobs: - test-gpu: - if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" - runs-on: [self-hosted, gpu] - steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Install dependencies - run: | - rm -f ~/.dace.conf - rm -rf .dacecache tests/.dacecache - python -m venv ~/.venv # create venv so we can use pip - source ~/.venv/bin/activate # activate venv - python -m pip install --upgrade pip - pip install flake8 pytest-xdist coverage - pip install mpi4py - pip install cupy - pip uninstall -y dace - pip install -e ".[testing]" - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - - name: Test dependencies - run: | - source ~/.venv/bin/activate # activate venv - nvidia-smi - - - name: Run pytest GPU - run: | - source ~/.venv/bin/activate # activate venv - export DACE_cache=single - export PATH=$PATH:/usr/local/cuda/bin # some test is calling cuobjdump, so it needs to be in path - echo "CUDACXX: $CUDACXX" - pytest --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "gpu" - - - name: Run extra GPU tests - run: | - source ~/.venv/bin/activate # activate venv - export NOSTATUSBAR=1 - export DACE_cache=single - export COVERAGE_RCFILE=`pwd`/.coveragerc - export PYTHON_BINARY="coverage run --source=dace --parallel-mode" - ./tests/cuda_test.sh - - - name: Report overall coverage - run: | - source ~/.venv/bin/activate # activate venv - export COVERAGE_RCFILE=`pwd`/.coveragerc - coverage combine . */; coverage report; coverage xml - reachable=0 - ping -W 2 -c 1 codecov.io || reachable=$? - if [ $reachable -eq 0 ]; then - ./codecov - else - echo "Codecov.io is unreachable" - fi diff --git a/.github/workflows/hardware_test.yml b/.github/workflows/hardware_test.yml deleted file mode 100644 index 59dc201e4b..0000000000 --- a/.github/workflows/hardware_test.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: DaCe RTL hardware emulation -on: workflow_dispatch -jobs: - test-rtl: - runs-on: [self-hosted, linux, xilinx-fpga] - steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Install dependencies - run: | - rm -f ~/.dace.conf - rm -rf .dacecache tests/.dacecache - . 
/opt/setupenv - python -m pip install --upgrade pip - pip install pytest-xdist flake8 - pip uninstall -y dace - pip install -e ".[testing]" - - - name: Run FPGA Tests - run: | - # Due to an internal bug in the Xilinx tools, where the current datetime is passed as an integer - # and overflowed in the year 2022, run the RTL FPGA tests pretending like it's January 1st 2021. - faketime -f "@2021-01-01 00:00:00" pytest -n auto --tb=short -m "rtl_hardware" - killall -9 xsim xsimk || true diff --git a/.github/workflows/heterogeneous-ci.yml b/.github/workflows/heterogeneous-ci.yml deleted file mode 100644 index 53a8788dce..0000000000 --- a/.github/workflows/heterogeneous-ci.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: Heterogeneous Tests - -on: - push: - branches: [ main, ci-fix ] - pull_request: - branches: [ main, ci-fix ] - merge_group: - branches: [ main, ci-fix ] - -env: - CUDA_HOME: /usr/local/cuda - CUDACXX: nvcc - MKLROOT: /opt/intel/oneapi/mkl/latest/ - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - -concurrency: - group: ${{github.workflow}}-${{github.ref}} - cancel-in-progress: true - -jobs: - test-heterogeneous: - if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" - runs-on: [self-hosted, linux] - steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Install dependencies - run: | - rm -f ~/.dace.conf - rm -rf .dacecache tests/.dacecache - python -m venv ~/.venv # create venv so we can use pip - source ~/.venv/bin/activate # activate venv - python -m pip install --upgrade pip - pip install flake8 pytest-xdist coverage - pip install mpi4py pytest-mpi - pip uninstall -y dace - pip install -e ".[testing]" - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - - name: Test dependencies - run: | - papi_avail - - - name: Run parallel pytest - run: | - source ~/.venv/bin/activate # activate venv - export DACE_cache=unique - pytest --cov-report=xml --cov=dace --tb=short --timeout_method thread --timeout=300 -m "verilator or mkl or papi or datainstrument" - - - name: Run MPI tests - run: | - export NOSTATUSBAR=1 - export DACE_cache=single - export COVERAGE_RCFILE=`pwd`/.coveragerc - export PYTHON_BINARY="coverage run --source=dace --parallel-mode" - source ~/.venv/bin/activate # activate venv - ./tests/mpi_test.sh - - - - name: Test MPI with pytest - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=1 - export DACE_testing_deserialize_exception=1 - export DACE_cache=unique - source ~/.venv/bin/activate # activate venv - mpirun -n 2 coverage run --source=dace --parallel-mode -m pytest -x --with-mpi --tb=short --timeout_method thread --timeout=300 -m "mpi" - - - name: Test ScaLAPACK PBLAS with pytest - run: | - export NOSTATUSBAR=1 - export DACE_testing_serialization=1 - export DACE_testing_deserialize_exception=1 - export DACE_cache=unique - export DACE_library_pblas_default_implementation=ReferenceOpenMPI - source ~/.venv/bin/activate # activate venv - for i in {1..4} - do - mpirun -n "$i" --oversubscribe coverage run --source=dace --parallel-mode -m pytest -x --with-mpi --tb=short --timeout_method thread --timeout=300 -m "scalapack" - done - - - name: Report overall coverage - run: | - export COVERAGE_RCFILE=`pwd`/.coveragerc - source ~/.venv/bin/activate # activate venv - coverage combine . */; coverage report; coverage xml - reachable=0 - ping -W 2 -c 1 codecov.io || reachable=$? 
- if [ $reachable -eq 0 ]; then - ./codecov - else - echo "Codecov.io is unreachable" - fi diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml deleted file mode 100644 index 899c0f2681..0000000000 --- a/.github/workflows/linting.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Code Quality - -on: - push: - branches: [ main, ci-fix ] - pull_request: - branches: [ main, ci-fix ] - -jobs: - linting: - if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" - name: pre-commit - runs-on: ubuntu-latest - - steps: - - name: Check repository - uses: actions/checkout@v4 - - - name: Setup Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: '3.9' - cache: 'pip' - - - name: Install linting tools - run: pip install .[linting] - - - name: Run linting tools - id: lint - continue-on-error: true - run: pre-commit run --all-files - - - name: Show git diff - if: steps.lint.outcome == 'failure' - run: | - ./.github/workflows/scripts/show-git-diff.sh diff --git a/.github/workflows/pyFV3-ci.yml b/.github/workflows/pyFV3-ci.yml deleted file mode 100644 index 423a6c862f..0000000000 --- a/.github/workflows/pyFV3-ci.yml +++ /dev/null @@ -1,105 +0,0 @@ -name: NASA/NOAA pyFV3 repository build test - -# Temporarily disabled for main, and instead applied to a specific DaCe v1 maintenance branch (v1/maintenance). Once -# the FV3 bridge has been adapted to DaCe v1, this will need to be reverted back to apply to main. -on: - push: - #branches: [ main, ci-fix ] - branches: [ v1/maintenance, ci-fix ] - pull_request: - #branches: [ main, ci-fix ] - branches: [ v1/maintenance, ci-fix ] - merge_group: - #branches: [ main, ci-fix ] - branches: [ v1/maintenance, ci-fix ] - -defaults: - run: - shell: bash - -concurrency: - group: ${{github.workflow}}-${{github.ref}} - cancel-in-progress: true - -jobs: - build_and_validate_pyFV3: - if: "!contains(github.event.pull_request.labels.*.name, 'no-ci')" - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.11.7] - - steps: - - uses: actions/checkout@v4 - with: - repository: 'NOAA-GFDL/PyFV3' - ref: 'ci/DaCe' - submodules: 'recursive' - path: 'pyFV3' - - uses: actions/checkout@v4 - with: - path: 'dace' - submodules: 'recursive' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install library dependencies - run: | - sudo apt-get update - sudo apt-get install -y libopenmpi-dev libboost-all-dev - gcc --version - # Because Github doesn't allow us to do a git checkout in code - # we use a trick to checkout DaCe first (not using the external submodule) - # install the full suite via requirements_dev, then re-install the correct DaCe - - name: Install Python packages - run: | - python -m pip install --upgrade pip wheel setuptools - pip install -e ./pyFV3[develop] - pip install -e ./dace - - name: Download data - run: | - cd pyFV3 - mkdir -p test_data - cd test_data - wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.D_SW.tar.gz - tar -xzvf 8.1.3_c12_6ranks_standard.D_SW.tar.gz - wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.RiemSolver3.tar.gz - tar -xzvf 8.1.3_c12_6ranks_standard.RiemSolver3.tar.gz - wget --retry-connrefused https://portal.nccs.nasa.gov/datashare/astg/smt/pace-regression-data/8.1.3_c12_6ranks_standard.Remapping.tar.gz - tar -xzvf 8.1.3_c12_6ranks_standard.Remapping.tar.gz - cd ../.. 
- # Clean up caches between run for stale un-expanded SDFG to trip the build system (NDSL side issue) - - name: "Regression test: Riemman Solver on D-grid (RiemSolver3)" - env: - FV3_DACEMODE: BuildAndRun - PACE_CONSTANTS: GFS - PACE_LOGLEVEL: Debug - run: | - pytest -v -s --data_path=./pyFV3/test_data/8.1.3/c12_6ranks_standard/dycore \ - --backend=dace:cpu --which_modules=Riem_Solver3 \ - --threshold_overrides_file=./pyFV3/tests/savepoint/translate/overrides/standard.yaml \ - ./pyFV3/tests/savepoint - rm -r ./.gt_cache_FV3_A - - name: "Regression test: Shallow water lagrangian dynamics on D-grid (D_SW) (on rank 0 only)" - env: - FV3_DACEMODE: BuildAndRun - PACE_CONSTANTS: GFS - PACE_LOGLEVEL: Debug - run: | - pytest -v -s --data_path=./pyFV3/test_data/8.1.3/c12_6ranks_standard/dycore \ - --backend=dace:cpu --which_modules=D_SW --which_rank=0 \ - --threshold_overrides_file=./pyFV3/tests/savepoint/translate/overrides/standard.yaml \ - ./pyFV3/tests/savepoint - rm -r ./.gt_cache_FV3_A - - name: "Regression test: Remapping (on rank 0 only)" - env: - FV3_DACEMODE: BuildAndRun - PACE_CONSTANTS: GFS - PACE_LOGLEVEL: Debug - run: | - pytest -v -s --data_path=./pyFV3/test_data/8.1.3/c12_6ranks_standard/dycore \ - --backend=dace:cpu --which_modules=Remapping --which_rank=0 \ - --threshold_overrides_file=./pyFV3/tests/savepoint/translate/overrides/standard.yaml \ - ./pyFV3/tests/savepoint - rm -r ./.gt_cache_FV3_A diff --git a/.github/workflows/release.sh b/.github/workflows/release.sh deleted file mode 100755 index 7b8ff5f5e4..0000000000 --- a/.github/workflows/release.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -set -e - -# Install dependencies -pip install --upgrade twine - -# Synchronize submodules -git submodule update --init --recursive - -# Erase old distribution, if exists -rm -rf dist dace.egg-info - -# Make tarball -python -m build --sdist - -# Upload to PyPI -twine upload dist/* diff --git a/.github/workflows/scripts/show-git-diff.sh b/.github/workflows/scripts/show-git-diff.sh deleted file mode 100755 index a811c01672..0000000000 --- a/.github/workflows/scripts/show-git-diff.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# Check for uncommitted changes in the working tree -if [ -n "$(git status --porcelain)" ]; then - echo "Linting tools found the following changes are needed to comply" - echo "with our automatic styling." - echo "" - echo "Please run \"pre-commit run --all-files\" locally to fix these." 
- echo "See also https://github.com/spcl/dace/blob/main/CONTRIBUTING.md" - echo "" - echo "git status" - echo "----------" - git status - echo "" - echo "git diff" - echo "--------" - git --no-pager diff - echo "" - - exit 1 -fi diff --git a/.github/workflows/verilator_compatibility.yml b/.github/workflows/verilator_compatibility.yml deleted file mode 100644 index dce0c9b1fb..0000000000 --- a/.github/workflows/verilator_compatibility.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: DaCe Verilator Compatibility Check -on: - workflow_dispatch: - inputs: - reason: - description: 'Reason for the trigger' - required: true - default: 'Check compatibility' - schedule: - - cron: '0 0 1 * *' # monthly -jobs: - build: - strategy: - matrix: - verilator_version: ['4.028', '4.034', '4.036', '4.100', 'master'] - runs-on: ubuntu-20.04 - steps: - - name: trigger reason - run: echo "Trigger Reason:" ${{ github.event.inputs.reason }} - - uses: actions/checkout@v4 - - name: checkout submodules - run: git submodule update --init --recursive - - name: install apt packages - run: sudo apt-get update && sudo apt-get -y install git make autoconf g++ flex bison libfl2 libfl-dev - - name: compile verilator - run: git clone https://github.com/verilator/verilator.git && cd verilator && git fetch origin && if [ ! "${{ matrix.verilator_version }}" == "master" ]; then git checkout v${{ matrix.verilator_version }}; fi && autoconf && ./configure && make -j2 && sudo make install - - uses: actions/setup-python@v5 - with: - python-version: '3.8' - architecture: 'x64' - - uses: BSFishy/pip-action@v1 - with: - packages: pytest - requirements: requirements.txt - - name: install dace - run: python3 -m pip install . - - run: pytest -m "verilator" diff --git a/dace/builtin_hooks.py b/dace/builtin_hooks.py index 2a5b49e983..b691cd0296 100644 --- a/dace/builtin_hooks.py +++ b/dace/builtin_hooks.py @@ -96,7 +96,8 @@ def _make_filter_function(filter: Optional[Union[str, Callable[[Any], bool]]], if isinstance(filter, str): # If a string was given, construct predicate based on wildcard name matching if with_attr: - filter_func = lambda elem: fnmatch.fnmatch(elem.name, filter) + filter_func = lambda elem: fnmatch.fnmatch(elem.name, filter) if hasattr(elem, 'name') else fnmatch.fnmatch( + elem.label, filter) else: filter_func = lambda elem: fnmatch.fnmatch(elem, filter) elif callable(filter): diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index e1a5e33947..4d2f2ef506 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -141,7 +141,7 @@ if(DACE_ENABLE_CUDA) set(CMAKE_CUDA_ARCHITECTURES "${LOCAL_CUDA_ARCHITECTURES}") enable_language(CUDA) - list(APPEND DACE_LIBS CUDA::cudart) + list(APPEND DACE_LIBS CUDA::cudart CUDA::nvtx3) add_definitions(-DWITH_CUDA) if (MSVC_IDE) @@ -167,6 +167,21 @@ if(DACE_ENABLE_HIP) # Add libraries such as rocBLAS link_directories(${HIP_PATH}/../lib) + if(ROCM_PATH) + find_path(ROCTX_INCLUDE_DIR roctx.h HINTS ${ROCM_PATH}/include/roctracer ${ROCM_PATH}/roctracer/include) + if(NOT ROCTX_INCLUDE_DIR) + message(WARNING "Could not find roctx.h in ${ROCM_PATH}/include/roctracer or ${ROCM_PATH}/roctracer/include") + endif() + endif() + if(ROCM_PATH AND ROCTX_INCLUDE_DIR) + find_path(ROCTX_LIBRARY_DIR "libroctx64.so" HINTS ${ROCM_PATH}/lib) + if(NOT ROCTX_LIBRARY_DIR) + message(WARNING "Could not find libroctx64.so in ${ROCM_PATH}/lib") + else() + list(APPEND DACE_LIBS "-lroctx64 -L${ROCTX_LIBRARY_DIR}") + include_directories(SYSTEM ${ROCTX_INCLUDE_DIR}) + endif() + endif() endif() 
# Function for performing deferred variable expansion diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 733f0ba53c..4ba47ee326 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -5,10 +5,11 @@ import re import shutil import subprocess -from typing import Any, Callable, Dict, List, Tuple, Optional, Type, Union +from typing import Any, Callable, Dict, List, Tuple, Optional, Type, Union, Sequence import warnings import tempfile import pickle +import pathlib import sys import numpy as np @@ -77,7 +78,8 @@ def is_loaded(self) -> bool: lib_cfilename = ctypes.c_wchar_p(self._library_filename) else: # As UTF-8 - lib_cfilename = ctypes.c_char_p(self._library_filename.encode('utf-8')) + tt = self._library_filename.encode('utf-8') + lib_cfilename = ctypes.c_char_p(tt) return self._stub.is_library_loaded(lib_cfilename) == 1 @@ -96,21 +98,39 @@ def load(self): # Check if library is already loaded is_loaded = True lib_cfilename = None + lib_filename = self._library_filename + counter = 0 while is_loaded: # Convert library filename to string according to OS if os.name == 'nt': # As UTF-16 - lib_cfilename = ctypes.c_wchar_p(self._library_filename) + lib_cfilename = ctypes.c_wchar_p(lib_filename) else: # As UTF-8 - lib_cfilename = ctypes.c_char_p(self._library_filename.encode('utf-8')) + lib_cfilename = ctypes.c_char_p(lib_filename.encode('utf-8')) + # Test if the library is loaded. is_loaded = self._stub.is_library_loaded(lib_cfilename) + if is_loaded == 1: warnings.warn(f'Library {self._library_filename} already loaded, renaming file') + + # The library is loaded, so copy the _original_ library file to a new file + # and then try to load that. We only do the copy if the new name is + # free. It seems that at least on Linux there is some issue if we + # overwrite a file that already exists. + lib_filename = self._library_filename + f'_{counter}' + counter += 1 + if pathlib.Path(lib_filename).exists(): + assert pathlib.Path(lib_filename).is_file() + continue + + # The file name is not taken, so make a copy. There might be a race condition + # here in the presence of multiple processes. + # TODO: Investigate if we should switch to hardlinks if they are supported. try: - shutil.copyfile(self._library_filename, self._library_filename + '_') - self._library_filename += '_' + assert self._library_filename != lib_filename + shutil.copyfile(self._library_filename, lib_filename) except shutil.Error: raise cgx.DuplicateDLLError(f'Library {os.path.basename(self._library_filename)}' 'is already loaded somewhere else and cannot be unloaded. ' @@ -118,6 +138,7 @@ def load(self): # Actually load the library self._lib = ctypes.c_void_p(self._stub.load_library(lib_cfilename)) + self._library_filename = lib_filename if self._lib.value is None: # Try to understand why the library is not loading, if dynamic @@ -147,12 +168,38 @@ def __enter__(self, *args, **kwargs): def __exit__(self, *args, **kwargs): self.unload() + def __copy__(self): + raise RuntimeError(f'Cannot copy ReloadableDLL({self._library_filename})') + + def __deepcopy__(self, memodict={}): + raise RuntimeError(f'Cannot copy ReloadableDLL({self._library_filename})') + class CompiledSDFG(object): """ A compiled SDFG object that can be called through Python. - Todo: - Scalar return values are not handled properly, this is a code gen issue. + Essentially this class makes an SDFG callable.
Normally a user will not create it + directly; instead it is generated by utilities such as `SDFG.compile()`. + + The class performs the following tasks: + - It ensures that the SDFG object is properly initialized, either by a direct + call to `initialize()` or the first time it is called. Furthermore, it will + also take care of the finalization if it goes out of scope. + - It transforms Python arguments into C arguments. + + Technically there are two ways the SDFG can be called. The first is using + `__call__()`, i.e. as a normal function; however, this always processes + the arguments, performs some error checking, and is thus slow. The second way + is the advanced interface, which allows decomposing the call into separate + steps. For more information see `construct_arguments()`, `fast_call()` and + `convert_return_values()`. + + :note: In previous versions the arrays used as return values were sometimes reused. + However, this was changed and every time `construct_arguments()` is called + new arrays are allocated. + :note: It is not possible to return scalars. Note that currently using scalars + as return values is a validation error. The only exception is (probably) + Python objects. """ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): @@ -161,9 +208,14 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): self._lib = lib self._initialized = False self._libhandle = ctypes.c_void_p(0) - self._lastargs = () self.do_not_execute = False + # Contains the pointer arguments that were used the last time the SDFG was called + # through `__call__()`. It is also used by `get_workspace_sizes()`. + # NOTE: Using its content might be dangerous as only the pointers to arrays are + # stored. It is the user's responsibility to ensure that they are valid. + self._lastargs = None + lib.load() # Explicitly load the library self._init = lib.get_symbol('__dace_init_{}'.format(sdfg.name)) self._init.restype = ctypes.c_void_p @@ -172,17 +224,27 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): self._cfunc = lib.get_symbol('__program_{}'.format(sdfg.name)) # Cache SDFG return values - self._create_new_arrays: bool = True self._return_syms: Dict[str, Any] = None + # It will contain the shape of the array, or the name if the return array is passed as an argument. self._retarray_shapes: List[Tuple[str, np.dtype, dtypes.StorageType, Tuple[int], Tuple[int], int]] = [] - self._retarray_is_scalar: List[bool] = [] + # Only `True` if the return value is a scalar _and_ a `pyobject`. + self._retarray_is_pyobject: List[bool] = [] self._return_arrays: List[np.ndarray] = [] self._callback_retval_references: List[Any] = [] # Avoids garbage-collecting callback return values + # If there are return values, this is `True` if a single value is returned. Note that + # `False` either means that a tuple is returned or there are no return values. + # NOTE: Needed to handle the case of a tuple with one element.
+ self._is_single_value_ret: bool = False + if '__return' in self._sdfg.arrays: + assert not any(aname.startswith('__return_') for aname in self._sdfg.arrays.keys()) + self._is_single_value_ret = True + # Cache SDFG argument properties self._typedict = self._sdfg.arglist() self._sig = self._sdfg.signature_arglist(with_types=False, arglist=self._typedict) self._free_symbols = self._sdfg.free_symbols + self._constants = self._sdfg.constants self.argnames = argnames if self.argnames is None and len(sdfg.arg_names) != 0: @@ -269,12 +331,21 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]: """ Returns the total external memory size to be allocated for this SDFG. + Note that the function queries the sizes of the last call that was made by + `__call__()` or `initialize()`. Calls made by `fast_call()` or `safe_call()` + will not be considered. + :return: A dictionary mapping storage types to the number of bytes necessary to allocate for the SDFG to work properly. + :note: It is the user's responsibility that all arguments, especially the array + arguments, remain valid between the call to `__call__()` or `initialize()` + and the call to this function. """ if not self._initialized: raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' 'querying external memory size.') + if self._lastargs is None: + raise ValueError('`__call__()` or `initialize()` must be called before using `get_workspace_sizes()`.') result: Dict[dtypes.StorageType, int] = {} for storage in self.external_memory_types: @@ -288,15 +359,24 @@ def set_workspace(self, storage: dtypes.StorageType, workspace: Any): """ Sets the workspace for the given storage type to the given buffer. + Note that the function uses the arguments of the last call that was made by + `__call__()` or `initialize()`. Calls made by `fast_call()` or `safe_call()` + will not be considered. + :param storage: The storage type to fill. :param workspace: An array-convertible object (through ``__[cuda_]array_interface__``, see ``array_interface_ptr``) to use for the workspace. + :note: It is the user's responsibility that all arguments, especially the array + arguments, remain valid between the call to `__call__()` or `initialize()` + and the call to this function. """ if not self._initialized: raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' 'setting external memory.') if storage not in self.external_memory_types: raise ValueError(f'Compiled SDFG does not specify external memory of {storage}') + if self._lastargs is None: + raise ValueError('`__call__()` or `initialize()` must be called before using `set_workspace()`.') func = self._lib.get_symbol(f'__dace_set_external_memory_{storage.name}', None) ptr = dtypes.array_interface_ptr(workspace, storage) @@ -331,12 +411,13 @@ def initialize(self, *args, **kwargs): if self._initialized: return - if len(args) > 0 and self.argnames is not None: - kwargs.update({aname: arg for aname, arg in zip(self.argnames, args)}) - # Construct arguments in the exported C function order - _, initargtuple = self._construct_args(kwargs) + callargtuple, initargtuple = self.construct_arguments(*args, **kwargs) self._initialize(initargtuple) + + # The main reason for setting `_lastargs` here is to allow calls to `get_workspace_sizes()`. + self._lastargs = (callargtuple, initargtuple) + return self._libhandle def finalize(self): @@ -361,38 +442,34 @@ def __call__(self, *args, **kwargs): """ Forwards the Python call to the compiled ``SDFG``.
- The order of the positional arguments is expected to be the same as in - the ``argnames`` member. The function will roughly perform the - following tasks: - - Change the order of the Python arguments into the one required by - the binary. - - Performing some basic sanity checks. - - Transforming the Python arguments into their ``C`` equivalents. - - Allocate the memory for the return values. - - Call the ``C` function. + The order of the positional arguments is expected to be the same as in the + ``argnames`` member. The function will perform the following tasks: + - Calling ``construct_arguments()``, which creates the argument vector and + allocates the memory for the return values. + - Performing the actual call by means of ``fast_call()``, with enabled error + checks. + - Converting the return value into the expected format by means of + ``convert_return_values()`` and returning it. :note: The memory for the return values is allocated anew each time this function is called; thus, fresh objects are returned on every call and ``clear_return_values()`` is deprecated as a no-op. """ - if self.argnames is None and len(args) != 0: - raise KeyError(f"Passed positional arguments to an SDFG that does not accept them.") - elif len(args) > 0 and self.argnames is not None: - kwargs.update( - # `_construct_args` will handle all of its arguments as kwargs. - { - aname: arg - for aname, arg in zip(self.argnames, args) - }) - argtuple, initargtuple = self._construct_args(kwargs) # Missing arguments will be detected here. - # Return values are cached in `self._lastargs`. - return self.fast_call(argtuple, initargtuple, do_gpu_check=True) + argtuple, initargtuple = self.construct_arguments(*args, **kwargs) # Missing arguments will be detected here. + self._lastargs = (argtuple, initargtuple) + self.fast_call(argtuple, initargtuple, do_gpu_check=True) + return self.convert_return_values() def safe_call(self, *args, **kwargs): """ Forwards the Python call to the compiled ``SDFG`` in a separate process to avoid crashes in the main process. Raises an exception if the SDFG execution fails. + + Note that the current implementation lacks proper handling of return values. + Thus, output can only be transmitted through inout arguments. """ + if any(aname == '__return' or aname.startswith('__return_') for aname in self.sdfg.arrays.keys()): + raise NotImplementedError('`CompiledSDFG.safe_call()` does not support return values.') # Pickle the SDFG and arguments with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f: @@ -444,24 +521,25 @@ def safe_call(self, *args, **kwargs): def fast_call( self, - callargs: Tuple[Any, ...], - initargs: Tuple[Any, ...], + callargs: Sequence[Any], + initargs: Sequence[Any], do_gpu_check: bool = False, - ) -> Union[Tuple[Any, ...], Any]: + ) -> None: """ - Calls the underlying binary functions directly and bypassing - argument sanitation. + Calls the underlying binary functions directly, bypassing argument sanitation. - This is a faster, but less user friendly version of ``__call__()``.
While + ``__call__()`` transforms its Python arguments such that they can be + forwarded and allocates memory for the return values, this function assumes + that this processing was already done by the user. + To build the argument vectors, use `construct_arguments()`. :param callargs: Arguments passed to the actual computation. :param initargs: Arguments passed to the initialization function. :param do_gpu_check: Check if errors happened on the GPU. - :note: You may use `_construct_args()` to generate the processed arguments. + :note: This is an advanced interface. + :note: In previous versions this function also called `convert_return_values()`. """ try: # Call initializer function if necessary, then SDFG @@ -485,8 +563,7 @@ def fast_call( if lasterror is not None: raise RuntimeError( f'An error was detected when calling "{self._sdfg.name}": {self._get_error_text(lasterror)}') - - return self._convert_return_values() + return except (RuntimeError, TypeError, UnboundLocalError, KeyError, cgx.DuplicateDLLError, ReferenceError): self._lib.unload() raise @@ -498,18 +575,40 @@ def __del__(self): self._libhandle = ctypes.c_void_p(0) self._lib.unload() - def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: - """ - Main function that controls argument construction for calling - the C prototype of the SDFG. + def construct_arguments(self, *args: Any, **kwargs: Any) -> Tuple[Tuple[Any], Tuple[Any]]: + """Construct the argument vectors for calling the compiled SDFG. + + The function returns a pair of tuples that are suitable for `fast_call()`. + The first element is `callargs`, i.e. the full arguments, while the + second element is `initargs`, which is only used/needed the first time + an SDFG is called. + + Importantly, this function also allocates new return values. + The array objects are managed by `self` and remain valid until this + function is called again. However, they are also returned by `self.__call__()`. - Organizes arguments first by ``sdfg.arglist``, then data descriptors - by alphabetical order, then symbols by alphabetical order. + It is also possible to pass the array that should be used to return a value + directly as an argument. In that case the allocation for that return value + will be skipped. - :note: If not initialized this function will initialize the memory for - the return values, however, it might also reallocate said memory. - :note: This function will also update the internal argument cache. + :note: In case of arrays, the returned argument vectors only contain the + pointers to the underlying memory. Thus, it is the user's responsibility + to ensure that the memory remains allocated until the argument vector + is used. + :note: This is an advanced interface. """ + if self.argnames is None and len(args) != 0: + raise KeyError("Passed positional arguments to an SDFG that does not accept them.") + elif len(args) > 0 and self.argnames is not None: + positional_arguments = {aname: avalue for aname, avalue in zip(self.argnames, args)} + if not positional_arguments.keys().isdisjoint(kwargs.keys()): + raise ValueError( + f'These arguments were passed both as positional and named arguments: {set(positional_arguments.keys()).intersection(kwargs.keys())}' + ) + kwargs.update(positional_arguments) + + # NOTE: This might invalidate the elements associated with the return values of + # all argument vectors that were created before.
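+        # Editor's sketch (hypothetical call, not part of this diff): with the collision
+        # check above, `csdfg.construct_arguments(A, A=A)` now raises a ValueError instead
+        # of silently letting the keyword value overwrite the positional one.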
self._initialize_return_values(kwargs) # Add the return values to the arguments, since they are part of the C signature. @@ -539,31 +638,51 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: argnames = [] sig = [] - # Type checking - cargs = [] no_view_arguments = not Config.get_bool('compiler', 'allow_view_arguments') - for i, (a, arg, atype) in enumerate(zip(argnames, arglist, argtypes)): - carg = dt.make_ctypes_argument(arg, - atype, - a, - allow_views=not no_view_arguments, - symbols=kwargs, - callback_retval_references=self._callback_retval_references) - cargs.append(carg) - - constants = self.sdfg.constants + cargs = tuple( + dt.make_ctypes_argument(aval, + atype, + aname, + allow_views=not no_view_arguments, + symbols=kwargs, + callback_retval_references=self._callback_retval_references) + for aval, atype, aname in zip(arglist, argtypes, argnames)) + symbols = self._free_symbols callparams = tuple((carg, aname) for arg, carg, aname in zip(arglist, cargs, argnames) - if not (symbolic.issymbolic(arg) and (hasattr(arg, 'name') and arg.name in constants))) - - newargs = tuple(carg for carg, aname in callparams) + if not ((hasattr(arg, 'name') and arg.name in self._constants) and symbolic.issymbolic(arg))) + newargs = tuple(carg for carg, _aname in callparams) initargs = tuple(carg for carg, aname in callparams if aname in symbols) - self._lastargs = newargs, initargs - return self._lastargs + return (newargs, initargs) + + def convert_return_values(self) -> Union[Any, Tuple[Any, ...]]: + """Convert the return arrays into the values a Python function would return. + + This function should only be called after `fast_call()` has been run. + Keep in mind that it is not possible to return scalars (with the exception of + `pyobject`s); they will always be returned as an array with shape `(1,)`. + + :note: This is an advanced interface. + :note: After `fast_call()` returns, it is only allowed to call this function once. + """ + # TODO: Make sure that the function is called only once by checking it.
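+        # Editor's sketch of the advanced interface described above (hypothetical `csdfg`,
+        # arrays `A`, `B` and symbol `N`; not part of this diff):
+        #
+        #   callargs, initargs = csdfg.construct_arguments(A, B, N=1024)
+        #   csdfg.fast_call(callargs, initargs, do_gpu_check=False)
+        #   result = csdfg.convert_return_values()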
+ # NOTE: Currently it is not possible to return a scalar value, see `tests/sdfg/scalar_return.py` + if not self._return_arrays: + return None + elif self._is_single_value_ret: + assert len(self._return_arrays) == 1 + return self._return_arrays[0].item() if self._retarray_is_pyobject[0] else self._return_arrays[0] + else: + return tuple(r.item() if is_pyobj else r + for r, is_pyobj in zip(self._return_arrays, self._retarray_is_pyobject)) def clear_return_values(self): - self._create_new_arrays = True + warnings.warn( + 'The "CompiledSDFG.clear_return_values" API is deprecated; its former behaviour' + ' is now the default, so this call is a no-op.', DeprecationWarning) + pass def _create_array(self, _: str, dtype: np.dtype, storage: dtypes.StorageType, shape: Tuple[int], strides: Tuple[int], total_size: int): @@ -599,52 +718,76 @@ def _initialize_return_values(self, kwargs): # Clear references from last call (allow garbage collection) self._callback_retval_references.clear() - if self._initialized: - if self._return_syms == syms: - if not self._create_new_arrays: - return - else: - self._create_new_arrays = False - # Use stored sizes to recreate arrays (fast path) - self._return_arrays = tuple(kwargs[desc[0]] if desc[0] in kwargs else self._create_array(*desc) - for desc in self._retarray_shapes) - return + if self._initialized and self._return_syms == syms: + # Use stored sizes to recreate arrays (fast path) + self._return_arrays = tuple(kwargs[desc[0]] if desc[0] in kwargs else self._create_array(*desc) + for desc in self._retarray_shapes) + return self._return_syms = syms - self._create_new_arrays = False - - # Initialize return values with numpy arrays - self._retarray_shapes = [] self._return_arrays = [] + self._retarray_shapes = [] + self._retarray_is_pyobject = [] for arrname, arr in sorted(self.sdfg.arrays.items()): - if arrname.startswith('__return') and not arr.transient: - if arrname in kwargs: + if arrname.startswith('__return'): + if arr.transient: + raise ValueError(f'Used the special array name "{arrname}" for a transient.') + + elif arrname in kwargs: + # The return value is passed as an argument; in that case, store the name in `self._retarray_shapes`.
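+                # Editor's sketch (hypothetical names, not part of this diff): callers may
+                # pre-allocate the output and pass it under the reserved name, e.g.
+                # `csdfg(A, __return=out)`; this branch then reuses `out` instead of
+                # allocating a fresh return array.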
+ warnings.warn(f'Return value "{arrname}" is passed as a regular argument.', stacklevel=2) self._return_arrays.append(kwargs[arrname]) - self._retarray_is_scalar.append(isinstance(arr, dt.Scalar)) self._retarray_shapes.append((arrname, )) - continue - if isinstance(arr, dt.Stream): + elif isinstance(arr, dt.Stream): raise NotImplementedError('Return streams are unsupported') - shape = tuple(symbolic.evaluate(s, syms) for s in arr.shape) - dtype = arr.dtype.as_numpy_dtype() - total_size = int(symbolic.evaluate(arr.total_size, syms)) - strides = tuple(symbolic.evaluate(s, syms) * arr.dtype.bytes for s in arr.strides) - shape_desc = (arrname, dtype, arr.storage, shape, strides, total_size) - self._retarray_is_scalar.append(isinstance(arr, dt.Scalar) or isinstance(arr.dtype, dtypes.pyobject)) - self._retarray_shapes.append(shape_desc) - - # Create an array with the properties of the SDFG array - arr = self._create_array(*shape_desc) - self._return_arrays.append(arr) + else: + shape = tuple(symbolic.evaluate(s, syms) for s in arr.shape) + dtype = arr.dtype.as_numpy_dtype() + total_size = int(symbolic.evaluate(arr.total_size, syms)) + strides = tuple(symbolic.evaluate(s, syms) * arr.dtype.bytes for s in arr.strides) + shape_desc = (arrname, dtype, arr.storage, shape, strides, total_size) + self._retarray_shapes.append(shape_desc) + + # Create an array with the properties of the SDFG array + return_array = self._create_array(*shape_desc) + self._return_arrays.append(return_array) + + # BUG COMPATIBILITY(PR#2206): + # In the original version `_retarray_is_pyobject` was named `_retarray_is_scalar`; however, + # since scalars could not be returned at the [implementation level](https://github.com/spcl/dace/pull/1609), + # it was essentially useless. It was, however, used for `pyobject`s in _some_ cases. And indeed, + # since `pyobject`s are essentially `void` pointers, it was, in principle, possible to return/pass + # them as "scalars", read "not inside an array". + # However, if the return value was passed as an argument, i.e. the first `elif`, it + # was ignored if `arr` was a `pyobject`. Only if the return value was managed by `self`, + # i.e. the `else` case, was it considered, in a way at least. The problem was that it was + # done using the following check: + # `isinstance(arr, dt.Scalar) or isinstance(arr.dtype, dtypes.pyobject)` + # Because of the `or` that is used, _everything_ whose `dtype` is `pyobject` was classified + # as a scalar `pyobject`, i.e. one element, even if it was in fact an array of millions of `pyobject`s. + # The correct behaviour would be to change the `or` to an `and`, but then several unit + # tests (`test_pyobject_return`, `test_pyobject_return_tuple` and `test_nested_autoparse[False]` + # in `tests/python_frontend/callee_autodetect_test.py`) would fail. + # The following code is bug compatible and also allows passing a `pyobject` directly, i.e. + # through `kwargs`. + if isinstance(arr.dtype, dtypes.pyobject): + if isinstance(arr, dt.Scalar): + # Proper scalar. + self._retarray_is_pyobject.append(True) + elif isinstance(arr, dt.Array): + # An array, let's check if it is just a wrapper for a single value.
+ if not (len(arr.shape) == 1 and arr.shape[0] == 1): + warnings.warn(f'Decaying an array of `pyobject`s with shape {arr.shape} to a single one.', + stacklevel=2) + self._retarray_is_pyobject.append(True) + else: + raise ValueError( + f'Do not know how to handle "{arrname}", which is a {type(arr).__name__} of `pyobject`.') + else: + self._retarray_is_pyobject.append(False) - def _convert_return_values(self): - # Return the values as they would be from a Python function - # NOTE: Currently it is not possible to return a scalar value, see `tests/sdfg/scalar_return.py` - if not self._return_arrays: - return None - elif len(self._return_arrays) == 1: - return self._return_arrays[0].item() if self._retarray_is_scalar[0] else self._return_arrays[0] - else: - return tuple(r.item() if scalar else r for r, scalar in zip(self._return_arrays, self._retarray_is_scalar)) + assert (not self._is_single_value_ret) or (len(self._return_arrays) == 1) + assert len(self._return_arrays) == len(self._retarray_shapes) == len(self._retarray_is_pyobject) + self._return_arrays = tuple(self._return_arrays) diff --git a/dace/codegen/instrumentation/__init__.py b/dace/codegen/instrumentation/__init__.py index d357e1a5a3..5ebab3f497 100644 --- a/dace/codegen/instrumentation/__init__.py +++ b/dace/codegen/instrumentation/__init__.py @@ -7,5 +7,6 @@ from .timer import TimerProvider from .gpu_events import GPUEventProvider from .fpga import FPGAInstrumentationProvider +from .gpu_tx_markers import GPUTXMarkersProvider from .data.data_dump import SaveProvider, RestoreProvider diff --git a/dace/codegen/instrumentation/gpu_tx_markers.py b/dace/codegen/instrumentation/gpu_tx_markers.py new file mode 100644 index 0000000000..be94e425fe --- /dev/null +++ b/dace/codegen/instrumentation/gpu_tx_markers.py @@ -0,0 +1,255 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import os +from typing import Union + +from dace import dtypes, registry +from dace.codegen import common +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.instrumentation.provider import InstrumentationProvider +from dace.memlet import Memlet +from dace.sdfg import nodes, SDFG +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.nodes import NestedSDFG +from dace.sdfg.scope import is_devicelevel_gpu_kernel +from dace.sdfg.sdfg import SDFG +from dace.sdfg.state import ControlFlowRegion, SDFGState + + +@registry.autoregister_params(type=dtypes.InstrumentationType.GPU_TX_MARKERS) +class GPUTXMarkersProvider(InstrumentationProvider): + """ Timing instrumentation that adds NVTX/rocTX ranges to SDFGs and states. """ + NVTX_HEADER_INCLUDE = '#include <nvtx3/nvToolsExt.h>' + ROCTX_HEADER_INCLUDE = '#include <roctx.h>' + + def __init__(self): + self.backend = common.get_gpu_backend() + # Check if ROCm TX libraries and headers are available + rocm_path = os.getenv('ROCM_PATH', '/opt/rocm') + roctx_header_paths = [ + os.path.join(rocm_path, 'roctracer/include/roctx.h'), + os.path.join(rocm_path, 'include/roctracer/roctx.h') + ] + roctx_library_path = os.path.join(rocm_path, 'lib', 'libroctx64.so') + self.enable_rocTX = any(os.path.isfile(path) + for path in roctx_header_paths) and os.path.isfile(roctx_library_path) + self.include_generated = False + super().__init__() + + def _print_include(self, sdfg: SDFG) -> None: + """ Prints the include statement for the NVTX/rocTX library for a given SDFG.
""" + if self.include_generated: + return + if self.backend == 'cuda': + sdfg.append_global_code(self.NVTX_HEADER_INCLUDE, 'frame') + elif self.backend == 'hip': + if self.enable_rocTX: + sdfg.append_global_code(self.ROCTX_HEADER_INCLUDE, 'frame') + else: + raise NameError('GPU backend "%s" not recognized' % self.backend) + self.include_generated = True + + def print_include(self, stream: CodeIOStream) -> None: + """ Prints the include statement for the NVTX/rocTX library in stream. """ + if stream is None: + return + if self.backend == 'cuda': + stream.write(self.NVTX_HEADER_INCLUDE) + elif self.backend == 'hip': + if self.enable_rocTX: + stream.write(self.ROCTX_HEADER_INCLUDE) + else: + raise NameError('GPU backend "%s" not recognized' % self.backend) + + def print_range_push(self, name: str, sdfg: SDFG, stream: CodeIOStream) -> None: + if stream is None: + return + self._print_include(sdfg) + if name is None: + name = 'None' + if self.backend == 'cuda': + stream.write(f'nvtxRangePush("{name}");') + elif self.backend == 'hip': + if self.enable_rocTX: + stream.write(f'roctxRangePush("{name}");') + else: + raise NameError(f'GPU backend "{self.backend}" not recognized') + + def print_range_pop(self, stream: CodeIOStream) -> None: + if stream is None: + return + if self.backend == 'cuda': + stream.write('nvtxRangePop();') + elif self.backend == 'hip': + if self.enable_rocTX: + stream.write('roctxRangePop();') + else: + raise NameError(f'GPU backend "{self.backend}" not recognized') + + def _is_sdfg_in_device_code(self, sdfg: SDFG) -> bool: + """ Check if the SDFG is in device code and not top level SDFG. """ + sdfg_parent_state = sdfg.parent + while sdfg_parent_state is not None: + sdfg_parent_node = sdfg.parent_nsdfg_node + if is_devicelevel_gpu_kernel(sdfg, sdfg_parent_state, sdfg_parent_node): + return True + sdfg_parent_state = sdfg_parent_state.sdfg.parent + return False + + def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream, codegen) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_push(f'sdfg_{sdfg.name}', sdfg, local_stream) + + def on_sdfg_end(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_pop(local_stream) + + def on_state_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: + if state.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_push(f'state_{state.label}', sdfg, local_stream) + + def on_state_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: + if state.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_pop(local_stream) + + def on_copy_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, src_node: nodes.Node, + dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], local_stream: CodeIOStream, + global_stream: CodeIOStream, copy_shape, src_strides, dst_strides) -> None: + if 
state.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if is_devicelevel_gpu_kernel(sdfg, state, src_node) or is_devicelevel_gpu_kernel(sdfg, state, dst_node): + # Don't instrument device code + return + self.print_range_push(f'copy_{src_node.label}_to_{dst_node.label}', sdfg, local_stream) + + def on_copy_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, src_node: nodes.Node, + dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: + if state.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if is_devicelevel_gpu_kernel(sdfg, state, src_node) or is_devicelevel_gpu_kernel(sdfg, state, dst_node): + # Don't instrument device code + return + self.print_range_pop(local_stream) + + def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.EntryNode, + outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if node.map.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if is_devicelevel_gpu_kernel(sdfg, state, node): + # Don't instrument device code + return + self.print_range_push(f'scope_{node.label}', sdfg, outer_stream) + + def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode, + outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + entry_node = state.entry_node(node) + if entry_node.map.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if is_devicelevel_gpu_kernel(sdfg, state, entry_node): + # Don't instrument device code + return + self.print_range_pop(outer_stream) + + def on_sdfg_init_begin(self, sdfg: SDFG, callsite_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + # cannot push rocTX markers before initializing HIP + if self.enable_rocTX: + return + self.print_range_push(f'init_{sdfg.name}', sdfg, callsite_stream) + + def on_sdfg_init_end(self, sdfg: SDFG, callsite_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + # cannot push rocTX markers before initializing HIP so there's no marker to pop + if self.enable_rocTX: + return + self.print_range_pop(callsite_stream) + + def on_sdfg_exit_begin(self, sdfg: SDFG, callsite_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_push(f'exit_{sdfg.name}', sdfg, callsite_stream) + + def on_sdfg_exit_end(self, sdfg: SDFG, callsite_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_pop(callsite_stream) + + def on_allocation_begin(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + # We only want to instrument allocations at the SDFG or state level + if not isinstance(scope, (SDFGState, SDFG)): + return + if 
self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_push(f'alloc_{sdfg.name}', sdfg, stream) + + def on_allocation_end(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + # We only want to instrument allocations at the SDFG or state level + if not isinstance(scope, (SDFGState, SDFG)): + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_pop(stream) + + def on_deallocation_begin(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + # We only want to instrument deallocations at the SDFG or state level + if not isinstance(scope, (SDFGState, SDFG)): + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_push(f'dealloc_{sdfg.name}', sdfg, stream) + + def on_deallocation_end(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + if sdfg.instrument != dtypes.InstrumentationType.GPU_TX_MARKERS: + return + # We only want to instrument deallocations at the SDFG or state level + if not isinstance(scope, (SDFGState, SDFG)): + return + if self._is_sdfg_in_device_code(sdfg): + # Don't instrument device code + return + self.print_range_pop(stream) diff --git a/dace/codegen/instrumentation/provider.py b/dace/codegen/instrumentation/provider.py index a95c0495ba..dc643df4ca 100644 --- a/dace/codegen/instrumentation/provider.py +++ b/dace/codegen/instrumentation/provider.py @@ -183,3 +183,75 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node :param global_stream: Code generator for global (external) code. """ pass + + def on_sdfg_init_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + """ Event called at the beginning of SDFG initialization code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator for the in-function code. + :param global_stream: Code generator for global (external) code. + """ + pass + + def on_sdfg_init_end(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + """ Event called at the end of SDFG initialization code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator for the in-function code. + :param global_stream: Code generator for global (external) code. + """ + pass + + def on_sdfg_exit_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + """ Event called at the beginning of SDFG exit code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator for the in-function code. + :param global_stream: Code generator for global (external) code. + """ + pass + + def on_sdfg_exit_end(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + """ Event called at the end of SDFG exit code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator for the in-function code. + :param global_stream: Code generator for global (external) code. 
+ """ + pass + + def on_allocation_begin(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + """ Event called at the beginning of an allocation code generation. + + :param sdfg: The generated SDFG object. + :param stream: Code generator. + """ + pass + + def on_allocation_end(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + lstream: CodeIOStream) -> None: + """ Event called at the end of an allocation code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator. + """ + pass + + def on_deallocation_begin(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + stream: CodeIOStream) -> None: + """ Event called at the beginning of a deallocation code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator. + """ + pass + + def on_deallocation_end(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], + lstream: CodeIOStream) -> None: + """ Event called at the end of a deallocation code generation. + + :param sdfg: The generated SDFG object. + :param local_stream: Code generator. + """ + pass diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index fe277538d3..127cf3fa4d 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1088,6 +1088,11 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St is_c_order = is_fortran_order dims = 1 + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_copy_begin(sdfg, cfg, state_dfg, src_node, dst_node, edge, callsite_stream, None, + copy_shape, src_strides, dst_strides) + if dims > 2: # Currently we only support ND copies when they can be represented # as a 1D copy or as a 2D strided copy @@ -1243,6 +1248,10 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St self._emit_sync(callsite_stream) + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_copy_end(sdfg, cfg, state_dfg, src_node, dst_node, edge, callsite_stream, None) + # Copy within the GPU elif (src_storage in gpu_storage_types and dst_storage in gpu_storage_types): diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 449e312efa..aecd78a092 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -14,6 +14,7 @@ from dace.codegen import dispatcher as disp from dace.codegen.prettycode import CodeIOStream from dace.codegen.common import codeblock_to_cpp, sym2cpp +from dace.codegen.instrumentation.gpu_tx_markers import GPUTXMarkersProvider from dace.codegen.targets.target import TargetCodeGenerator from dace.codegen.tools.type_inference import infer_expr_type from dace.sdfg import SDFG, SDFGState, nodes @@ -254,6 +255,13 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre # Write closing brace of program callsite_stream.write('}', sdfg) + if sdfg.instrument == dtypes.InstrumentationType.GPU_TX_MARKERS: + # Need to make sure that the necessary includes for GPU_TX_MARKERS are present + # in the generated code. 
+ gpu_tx_markers_provider = self._dispatcher.instrumentation.get(dtypes.InstrumentationType.GPU_TX_MARKERS) + if gpu_tx_markers_provider: + gpu_tx_markers_provider.print_include(callsite_stream) + # Write awkward footer to avoid 'extern "C"' issues params_comma = (', ' + params) if params else '' initparams_comma = (', ' + initparams) if initparams else '' @@ -279,11 +287,17 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write( f""" DACE_EXPORTED {mangle_dace_state_struct_name(sdfg)} *__dace_init_{sdfg.name}({initparams}) -{{ - int __result = 0; - {mangle_dace_state_struct_name(sdfg)} *__state = new {mangle_dace_state_struct_name(sdfg)}; +{{""", sdfg) - """, sdfg) + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_sdfg_init_begin(sdfg, callsite_stream, global_stream) + + callsite_stream.write( + f""" + int __result = 0; + {mangle_dace_state_struct_name(sdfg)} *__state = new {mangle_dace_state_struct_name(sdfg)};""", sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: @@ -304,17 +318,29 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write(self._initcode.getvalue(), sdfg) - callsite_stream.write( - f""" + callsite_stream.write(f""" if (__result) {{ delete __state; return nullptr; }} +""", sdfg) + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_sdfg_init_end(sdfg, callsite_stream, global_stream) + callsite_stream.write( + f""" return __state; }} DACE_EXPORTED int __dace_exit_{sdfg.name}({mangle_dace_state_struct_name(sdfg)} *__state) {{ +""", sdfg) + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_sdfg_exit_begin(sdfg, callsite_stream, global_stream) + callsite_stream.write(f""" int __err = 0; """, sdfg) @@ -349,6 +375,10 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write("}") callsite_stream.write('delete __state;\n', sdfg) + # Invoke all instrumentation providers + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_sdfg_exit_end(sdfg, callsite_stream, global_stream) callsite_stream.write('return __err;\n}\n', sdfg) def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeIOStream): @@ -798,6 +828,11 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Dispatches allocation of all arrays in the given scope. """ + if not self.to_allocate[scope]: + return + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_allocation_begin(sdfg, scope, callsite_stream)
""" for tsdfg, state, node, declare, allocate, _ in self.to_allocate[scope]: if state is not None: @@ -809,10 +844,18 @@ def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Un self._dispatcher.dispatch_allocate(tsdfg, cfg if state is None else state.parent_graph, state, state_id, node, desc, function_stream, callsite_stream, declare, allocate) + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_allocation_end(sdfg, scope, callsite_stream) def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, callsite_stream: CodeIOStream): + if len(self.to_allocate[scope]) == 0: + return + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_deallocation_begin(sdfg, scope, callsite_stream) """ Dispatches deallocation of all arrays in the given scope. """ for tsdfg, state, node, _, _, deallocate in self.to_allocate[scope]: if not deallocate: @@ -826,6 +869,9 @@ def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: self._dispatcher.dispatch_deallocate(tsdfg, state.parent_graph, state, state_id, node, desc, function_stream, callsite_stream) + for instr in self._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_deallocation_end(sdfg, scope, callsite_stream) def generate_code(self, sdfg: SDFG, diff --git a/dace/dtypes.py b/dace/dtypes.py index faadc84a50..28372087e9 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -167,6 +167,7 @@ class InstrumentationType(aenum.AutoNumberEnum): LIKWID_GPU = () GPU_Events = () FPGA = () + GPU_TX_MARKERS = () @undefined_safe_enum diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 44a085603d..def7d5dce6 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1030,8 +1030,12 @@ def get_latest_report_path(self) -> Optional[str]: :return: A path to the latest instrumentation report, or None if one does not exist. """ path = os.path.join(self.build_folder, 'perf') - files = [f for f in os.listdir(path) if f.startswith('report-')] - if len(files) == 0: + try: + files = [f for f in os.listdir(path) if f.startswith('report-')] + except FileNotFoundError: + return None + + if not files: return None return os.path.join(path, sorted(files, reverse=True)[0]) diff --git a/dace/version.py b/dace/version.py index 1f356cc57b..a71a5118e7 100644 --- a/dace/version.py +++ b/dace/version.py @@ -1 +1 @@ -__version__ = '1.0.0' +__version__ = '2025.11.05' diff --git a/doc/optimization/profiling.rst b/doc/optimization/profiling.rst index 87539e87a8..3f53d4e324 100644 --- a/doc/optimization/profiling.rst +++ b/doc/optimization/profiling.rst @@ -121,7 +121,8 @@ Instrumentation can also collect performance counters on CPUs and GPUs using `LI The :class:`~dace.dtypes.InstrumentationType.LIKWID_Counters` instrumentation type can be configured to collect a wide variety of performance counters on CPUs and GPUs. An example use can be found in the `LIKWID instrumentation code sample `_. - +There is also the :class:`~dace.dtypes.InstrumentationType.GPU_TX_MARKERS` instrumentation type which wraps in NVTX or rocTX markers the DaCe program executed on the GPU. Important parts of the execution of the program on the GPU as the different states, SDFGs and initialization and finalization phases are marked with these markers. +These markers can be used to visualize and measure the GPU activity using the NVIDIA Nsight Systems or ROCm Systems profilers. 
Instrumentation file format ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/dace.codegen.instrumentation.rst b/doc/source/dace.codegen.instrumentation.rst index d476090d6a..0fc5941097 100644 --- a/doc/source/dace.codegen.instrumentation.rst +++ b/doc/source/dace.codegen.instrumentation.rst @@ -4,6 +4,14 @@ dace.codegen.instrumentation package Submodules ---------- +dace.codegen.instrumentation.gpu_tx_markers module +-------------------------------------------------- + +.. automodule:: dace.codegen.instrumentation.gpu_tx_markers + :members: + :undoc-members: + :show-inheritance: + dace.codegen.instrumentation.fpga module ---------------------------------------- diff --git a/tests/codegen/external_memory_test.py b/tests/codegen/external_memory_test.py index 169e050914..47eac55ff3 100644 --- a/tests/codegen/external_memory_test.py +++ b/tests/codegen/external_memory_test.py @@ -30,7 +30,7 @@ def tester(a: dace.float64[N]): a = np.random.rand(20) if symbolic: - extra_args = dict(a=a, N=20) + extra_args = dict(N=20) else: extra_args = {} diff --git a/tests/instrumentation_test.py b/tests/instrumentation_test.py index 2aa26edf36..120e025812 100644 --- a/tests/instrumentation_test.py +++ b/tests/instrumentation_test.py @@ -4,6 +4,7 @@ import pytest import numpy as np +import re import sys import dace @@ -39,14 +40,17 @@ def onetest(instrumentation: dace.InstrumentationType, size=128): if isinstance(node, nodes.MapEntry) and node.map.label == 'mult': node.map.instrument = instrumentation state.instrument = instrumentation - # Set Timer instrumentation on the whole SDFG - if instrumentation == dace.InstrumentationType.Timer: - sdfg.instrument = instrumentation - if instrumentation == dace.InstrumentationType.GPU_Events: + if instrumentation in [dace.InstrumentationType.GPU_Events, dace.InstrumentationType.GPU_TX_MARKERS]: sdfg.apply_transformations(GPUTransformSDFG) - sdfg(A=A, B=B, C=C, N=size) + with dace.instrument(instrumentation, + filter='*', + annotate_maps=True, + annotate_tasklets=False, + annotate_states=True, + annotate_sdfgs=True): + sdfg(A=A, B=B, C=C, N=size) # Check for correctness assert np.allclose(C, 20 * A @ B) @@ -57,6 +61,22 @@ report = sdfg.get_latest_report() print(report) + # Check that the NVTX/rocTX range wrapper is present in the generated CPU code + if instrumentation == dace.InstrumentationType.GPU_TX_MARKERS: + code = sdfg.generate_code()[0].clean_code + tx_include = re.search(r'#include <(nvtx3/nvToolsExt|roctx)\.h>', code) + assert tx_include is not None + range_push = re.search(r'(nvtx|roctx)RangePush\("sdfg', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("copy', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("state', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("alloc', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("dealloc', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("init', code) is not None + range_push &= re.search(r'(nvtx|roctx)RangePush\("exit', code) is not None + assert range_push + range_pop = re.search(r'(nvtx|roctx)RangePop\b', code) + assert range_pop is not None + def test_timer(): onetest(dace.InstrumentationType.Timer) @@ -73,8 +93,14 @@ def test_gpu_events(): onetest(dace.InstrumentationType.GPU_Events) +@pytest.mark.gpu +def test_gpu_tx_markers(): + onetest(dace.InstrumentationType.GPU_TX_MARKERS) + + if __name__ == '__main__': test_timer() test_papi() if 
len(sys.argv) > 1 and sys.argv[1] == 'gpu': test_gpu_events() + test_gpu_tx_markers() diff --git a/tests/python_frontend/return_value_test.py b/tests/python_frontend/return_value_test.py index 4a845bea0b..4e704287bc 100644 --- a/tests/python_frontend/return_value_test.py +++ b/tests/python_frontend/return_value_test.py @@ -9,7 +9,15 @@ def test_return_scalar(): def return_scalar(): return 5 - assert return_scalar() == 5 + res = return_scalar() + assert res == 5 + + # Don't be fooled by the assertion above: the return value is an array. If you + # added a return type annotation to the program (e.g., `-> dace.int32`), you + # would get a validation error. + assert isinstance(res, np.ndarray) + assert res.shape == (1, ) + assert res.dtype == np.int64 def test_return_scalar_in_nested_function(): @@ -22,7 +30,15 @@ def nested_function() -> dace.int32: def return_scalar(): return nested_function() - assert return_scalar() == 5 + res = return_scalar() + assert res == 5 + + # Don't be fooled by the assertion above: the return value is an array. If you + # added a return type annotation to the program (e.g., `-> dace.int32`), you + # would get a validation error. + assert isinstance(res, np.ndarray) + assert res.shape == (1, ) + assert res.dtype == np.int32 def test_return_array(): @@ -42,6 +58,8 @@ def return_tuple(): return 5, 6 res = return_tuple() + assert isinstance(res, tuple) + assert len(res) == 2 assert res == (5, 6) @@ -52,6 +70,8 @@ def return_array_tuple(): return 5 * np.ones(5), 6 * np.ones(6) res = return_array_tuple() + assert isinstance(res, tuple) + assert len(res) == 2 assert np.allclose(res[0], 5 * np.ones(5)) assert np.allclose(res[1], 6 * np.ones(6)) @@ -66,10 +86,25 @@ def return_void(a: dace.float64[20]): a = np.random.rand(20) ref = a + 1 - return_void(a) + res = return_void(a) + assert res is None assert np.allclose(a, ref) +def test_return_tuple_1_element(): + + @dace.program + def return_one_element_tuple(a: dace.float64[20]): + return (a + 3.5, ) + + a = np.random.rand(20) + ref = a + 3.5 + res = return_one_element_tuple(a) + assert isinstance(res, tuple) + assert len(res) == 1 + assert np.allclose(res[0], ref) + + def test_return_void_in_if(): @dace.program