Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/hipify_torch
21 changes: 4 additions & 17 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,30 +502,17 @@ def uninstall_te_wheel_packages():
]
)

def detect_hipify_v2():
    """Report whether the installed PyTorch hipify module is v2.

    Returns:
        bool: True when ``torch.utils.hipify`` reports a version of 2.0.0
        or newer; False otherwise. Any failure during detection (missing
        module, unparsable version string, ...) is reported on stdout and
        treated as the pre-2.0 behavior.
    """
    try:
        # Keep the import order: torch first, then packaging, so the
        # reported exception matches whichever dependency is missing.
        from torch.utils.hipify import __version__ as hipify_version
        from packaging.version import Version
        is_v2 = Version(hipify_version) >= Version("2.0.0")
    except Exception as e:
        print("failed to detect pytorch hipify version, defaulting to version 1.0.0 behavior")
        print(e)
        return False
    return is_v2

def hipify(base_dir, src_dir, sources, include_dirs):
cwd = os.getcwd()
if detect_hipify_v2():
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python")
else:
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.hipify_python")
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python")
do_hipify = hipify_module.hipify
print(f"Run hipify on {src_dir}")

hipify_result = do_hipify(
project_directory=src_dir,
output_directory=src_dir,
includes=["*"],
ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*"],
includes=["*/common/*", str(src_dir)+"/*"],
ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*", "*/rocshmem_api/*"],
header_include_dirs=[d for d in include_dirs if Path(d).is_relative_to(base_dir)],
custom_map_list=base_dir / "hipify_custom_map.json",
extra_files=[],
Expand Down
4 changes: 2 additions & 2 deletions hipify_custom_map.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"custom_map" : {
"<cuda_bf16.h>" : "<hip/hip_bfloat16.h>",
"<cuda_fp8.h>" : "\"amd_detail/hip_float8.h\"",
"util/cuda_runtime.h" : "util/hip_runtime.h",
"<cuda_fp8.h>" : "\"common/amd_detail/hip_float8.h\"",
"cuda_runtime.h\"" : "hip_runtime.h\"",
"ATen/cudnn/Handle.h" : "ATen/miopen/Handle.h",
"CUfunc_cache" : "hipFuncCache_t",
"<nvtx3/nvToolsExt.h>" : "<roctracer/roctx.h>",
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def run(self):
if rocm_build():
print("Running hipification of installable headers for ROCm build...")
common_headers_dir = current_file_path / "transformer_engine/common/include"
hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir), [])
#TODO: some installable headers refer to non-installable headers (i.e., not from common/include),
#so we need to add extra include paths here to match the hipification results with the build process
hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir),
[common_headers_dir, current_file_path / "transformer_engine"])
super().run()

CMakeBuildExtension = get_build_ext(BuildExtension)
Expand Down
31 changes: 30 additions & 1 deletion tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was modified for portability to AMDGPU
# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
Expand Down Expand Up @@ -96,5 +96,34 @@ else()
find_package(hip REQUIRED)
endif()

if(USE_ROCM)
set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../..)
set(THIRDPARTY ${TE}/3rdparty)
list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake")
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

# hipify parameters should match those used in the TE CMakeLists.txt
set(header_include_dir
${TE}/transformer_engine/common/include
${TE}/transformer_engine/common
${TE}/transformer_engine)

set(cuda_source_dir ${PROJECT_SOURCE_DIR} )
message(STATUS "Run hipify on ${cuda_source_dir}")
hipify(CUDA_SOURCE_DIR ${cuda_source_dir}
HEADER_INCLUDE_DIR ${header_include_dir}
IGNORES "*/pytorch/csrc/*"
IGNORES "*/jax/csrc/*"
IGNORES "*/amd_detail/*"
IGNORES "*/aotriton/*"
IGNORES "*/ck_fused_attn/*"
IGNORES "*/rocshmem_api/*"
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
V2
)
endif()

add_subdirectory(operator)
add_subdirectory(util)
24 changes: 1 addition & 23 deletions tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,9 @@ endif()
if(USE_CUDA)
add_executable(test_operator ${test_cuda_sources})
else()
message("${message_line}")
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")

set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(THIRDPARTY ${TE}/3rdparty)
list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake")
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

file(REAL_PATH ../../../transformer_engine/common/include header_include_dir1)
file(REAL_PATH ../../../transformer_engine/common header_include_dir2)
set(header_include_dir ${header_include_dir1} ${header_include_dir2})

message(STATUS "CUDA_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")
message(STATUS "HEADER_INCLUDE_DIR: ${header_include_dir}")
set(cuda_source_dir ${PROJECT_SOURCE_DIR} )
hipify(CUDA_SOURCE_DIR ${cuda_source_dir}
HEADER_INCLUDE_DIR ${header_include_dir}
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
)
get_hipified_list("${test_cuda_sources}" test_hip_sources)
message("${message_line}")
message(STATUS "nvte tests hipified sources: ${test_hip_sources}")
message(STATUS "test_operator hipified sources: ${test_hip_sources}")

add_executable(test_operator ${test_hip_sources})
endif()
Expand Down
16 changes: 9 additions & 7 deletions tests/cpp/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
#
# See LICENSE for license information.

if(USE_CUDA)
add_executable(test_util
list(APPEND test_cuda_sources
test_nvrtc.cpp
test_string.cpp
../test_common.cu)
else()
add_executable(test_util
test_nvrtc_hip.cpp
test_string.cpp
../test_common.hip)
if(USE_CUDA)
add_executable(test_util ${test_cuda_sources})
else()
get_hipified_list("${test_cuda_sources}" test_hip_sources)
message("${message_line}")
message(STATUS "test_util hipified sources: ${test_hip_sources}")

add_executable(test_util ${test_hip_sources})
endif()

find_package(OpenMP REQUIRED)
Expand Down
12 changes: 6 additions & 6 deletions transformer_engine/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was modified for portability to AMDGPU
# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
Expand Down Expand Up @@ -211,13 +211,13 @@ else()
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

# Parameters for hipify here should match hipify from build_tools/utils.py
# Because CMake hipify doesn't support INCLUDES we explicitly add pytorch and jax to IGNORES
set(header_include_dir
${CMAKE_CURRENT_SOURCE_DIR}/..
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/util
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "HIPIFY CUDA_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "HIPIFY HEADER_INCLUDE_DIR: ${header_include_dir}")
message(STATUS "Run hipify on ${CMAKE_CURRENT_SOURCE_DIR}")
hipify(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}
HEADER_INCLUDE_DIR ${header_include_dir}
IGNORES "*/amd_detail/*"
Expand All @@ -228,13 +228,13 @@ else()
IGNORES "*/rocshmem_api/*"
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
V2
)
get_hipified_list("${transformer_engine_SOURCES}" te_hip_sources)
message("${message_line}")
message(STATUS "nvte hipified sources: ${te_hip_sources}")

add_library(transformer_engine SHARED ${te_hip_sources})
target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

target_include_directories(transformer_engine PUBLIC
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/*************************************************************************
* This file was modified for portability to AMDGPU
* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
Expand All @@ -11,7 +13,7 @@
#ifndef TRANSFORMER_ENGINE_MULTI_STREAM_H
#define TRANSFORMER_ENGINE_MULTI_STREAM_H

#include "cuda_runtime.h"
#include <cuda_runtime.h> //system CUDA header

#ifdef __cplusplus
extern "C" {
Expand Down
7 changes: 0 additions & 7 deletions transformer_engine/common/util/handle_manager.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
/*************************************************************************
* This file was modified for portability to AMDGPU
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
Expand All @@ -11,13 +9,8 @@

#include <vector>

#ifndef __HIP_PLATFORM_AMD__
#include "cuda_runtime.h"
#include "logging.h"
#else
#include "util/cuda_runtime.h"
#include "util/logging.h"
#endif

namespace transformer_engine::detail {

Expand Down
7 changes: 3 additions & 4 deletions transformer_engine/common/util/rocm_cast_gated_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/activation.h"
#include "transformer_engine/cast.h"
#include "vectorized_pointwise.h"
#include "utils.cuh"
#include "../utils.cuh"

namespace transformer_engine {
namespace gated_kernels {
Expand Down
21 changes: 10 additions & 11 deletions transformer_engine/common/util/rocm_cast_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,24 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/cast.h"
#include "transpose/cast_transpose.h"
#include "../transpose/cast_transpose.h"
#include "vectorized_pointwise.h"
#include "utils.cuh"
#include "../utils.cuh"

namespace transformer_engine {

// Forward declaration, definition is in cast_kernels.cuh
template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
float (*OP)(float, const ParamOP &)>
void mxfp8_quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
Tensor *output, Tensor *dbias, Tensor *workspace, cudaStream_t stream);
Tensor *output, Tensor *dbias, Tensor *workspace, hipStream_t stream);


constexpr size_t MXFP8_CHUNK_DIM_Y = 64;
Expand Down Expand Up @@ -401,15 +400,15 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
// Forward declaration of functions defined in `cast_kernels.cuh`
template <typename IType>
void reduce_dbias(const float *workspace_ptr, Tensor *dbias, const size_t rows, const size_t cols,
cudaStream_t stream);
hipStream_t stream);

template <typename ParamOP, float (*OP)(float, const ParamOP &)>
void CastVectorizedUnaryKernelLauncher(const Tensor &input, const Tensor *noop, Tensor *output,
cudaStream_t stream);
hipStream_t stream);

template <typename ParamOP, float (*OP)(float, const ParamOP &)>
void CastVectorizedUnaryGradKernelLauncher(const Tensor &grad, const Tensor *input, Tensor *output,
cudaStream_t stream);
hipStream_t stream);

constexpr size_t TILE_DIM = 32;
template <typename DTypeReduce>
Expand Down Expand Up @@ -445,7 +444,7 @@ __global__ void partial_reduce_kernel(const DTypeReduce* input, float* partial_o

template <typename DTypeReduce, typename DBiasTypeOut>
void reduce_dbias_rocm(const DTypeReduce *workspace_ptr, Tensor *dbias, const size_t rows,
const size_t cols, cudaStream_t stream, Tensor* partial_sum_workspace) {
const size_t cols, hipStream_t stream, Tensor* partial_sum_workspace) {
dim3 block_dim_partial(TILE_DIM, TILE_DIM);
dim3 grid_dim_partial(DIVUP(cols, TILE_DIM), DIVUP(rows, TILE_DIM));

Expand All @@ -464,7 +463,7 @@ template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
float (*OP)(float, const ParamOP &)>
void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tensor *noop,
Tensor *output, Tensor *dbias, Tensor *workspace,
cudaStream_t stream) {
hipStream_t stream) {
switch (output->scaling_mode) {
case NVTE_DELAYED_TENSOR_SCALING: {
const size_t rows = input.flat_first_dim();
Expand Down
9 changes: 4 additions & 5 deletions transformer_engine/common/util/rocm_dequantize_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,18 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <limits>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/activation.h"
#include "transformer_engine/cast.h"
#include "transpose/cast_transpose.h"
#include "../transpose/cast_transpose.h"
#include "transformer_engine/transpose.h"
#include "utils.cuh"
#include "../utils.cuh"
#include "vectorized_pointwise.h"

namespace transformer_engine {
Expand Down
Loading