Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/hipify_torch
21 changes: 4 additions & 17 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,30 +502,17 @@ def uninstall_te_wheel_packages():
]
)

def detect_hipify_v2():
    """Report whether the installed PyTorch hipify module is v2.

    Returns:
        bool: True when ``torch.utils.hipify`` reports a version of 2.0.0
        or newer; False otherwise. Any failure during detection (missing
        module, unparsable version string, ...) is reported on stdout and
        treated as the pre-2.0 behavior.
    """
    try:
        # Keep the import order: torch first, then packaging, so the
        # reported exception matches whichever dependency is missing.
        from torch.utils.hipify import __version__ as hipify_version
        from packaging.version import Version
        is_v2 = Version(hipify_version) >= Version("2.0.0")
    except Exception as e:
        print("failed to detect pytorch hipify version, defaulting to version 1.0.0 behavior")
        print(e)
        return False
    return is_v2

def hipify(base_dir, src_dir, sources, include_dirs):
cwd = os.getcwd()
if detect_hipify_v2():
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python")
else:
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.hipify_python")
hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python")
do_hipify = hipify_module.hipify
print(f"Run hipify on {src_dir}")

hipify_result = do_hipify(
project_directory=src_dir,
output_directory=src_dir,
includes=["*"],
ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*"],
includes=["*/common/*", str(src_dir)+"/*"],
ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*", "*/rocshmem_api/*"],
header_include_dirs=[d for d in include_dirs if Path(d).is_relative_to(base_dir)],
custom_map_list=base_dir / "hipify_custom_map.json",
extra_files=[],
Expand Down
4 changes: 2 additions & 2 deletions hipify_custom_map.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"custom_map" : {
"<cuda_bf16.h>" : "<hip/hip_bfloat16.h>",
"<cuda_fp8.h>" : "\"amd_detail/hip_float8.h\"",
"util/cuda_runtime.h" : "util/hip_runtime.h",
"<cuda_fp8.h>" : "\"common/amd_detail/hip_float8.h\"",
"cuda_runtime.h\"" : "hip_runtime.h\"",
"ATen/cudnn/Handle.h" : "ATen/miopen/Handle.h",
"CUfunc_cache" : "hipFuncCache_t",
"<nvtx3/nvToolsExt.h>" : "<roctracer/roctx.h>",
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def run(self):
if rocm_build():
print("Running hipification of installable headers for ROCm build...")
common_headers_dir = current_file_path / "transformer_engine/common/include"
hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir), [])
#TODO: some installable headers refer to non-installable headers (i.e., not from common/include),
#so we need to add extra include paths here to match the hipification results with the build process
hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir),
[common_headers_dir, current_file_path / "transformer_engine"])
super().run()

CMakeBuildExtension = get_build_ext(BuildExtension)
Expand Down
31 changes: 30 additions & 1 deletion tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was modified for portability to AMDGPU
# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
Expand Down Expand Up @@ -96,5 +96,34 @@ else()
find_package(hip REQUIRED)
endif()

if(USE_ROCM)
set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../..)
set(THIRDPARTY ${TE}/3rdparty)
list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake")
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

# hipify parameters should match those used in the TE CMakeLists.txt
set(header_include_dir
${TE}/transformer_engine/common/include
${TE}/transformer_engine/common
${TE}/transformer_engine)

set(cuda_source_dir ${PROJECT_SOURCE_DIR} )
message(STATUS "Run hipify on ${cuda_source_dir}")
hipify(CUDA_SOURCE_DIR ${cuda_source_dir}
HEADER_INCLUDE_DIR ${header_include_dir}
IGNORES "*/pytorch/csrc/*"
IGNORES "*/jax/csrc/*"
IGNORES "*/amd_detail/*"
IGNORES "*/aotriton/*"
IGNORES "*/ck_fused_attn/*"
IGNORES "*/rocshmem_api/*"
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
V2
)
endif()

add_subdirectory(operator)
add_subdirectory(util)
24 changes: 1 addition & 23 deletions tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,9 @@ endif()
if(USE_CUDA)
add_executable(test_operator ${test_cuda_sources})
else()
message("${message_line}")
message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")

set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(THIRDPARTY ${TE}/3rdparty)
list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake")
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

file(REAL_PATH ../../../transformer_engine/common/include header_include_dir1)
file(REAL_PATH ../../../transformer_engine/common header_include_dir2)
set(header_include_dir ${header_include_dir1} ${header_include_dir2})

message(STATUS "CUDA_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")
message(STATUS "HEADER_INCLUDE_DIR: ${header_include_dir}")
set(cuda_source_dir ${PROJECT_SOURCE_DIR} )
hipify(CUDA_SOURCE_DIR ${cuda_source_dir}
HEADER_INCLUDE_DIR ${header_include_dir}
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
)
get_hipified_list("${test_cuda_sources}" test_hip_sources)
message("${message_line}")
message(STATUS "nvte tests hipified sources: ${test_hip_sources}")
message(STATUS "test_operator hipified sources: ${test_hip_sources}")

add_executable(test_operator ${test_hip_sources})
endif()
Expand Down
16 changes: 9 additions & 7 deletions tests/cpp/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
#
# See LICENSE for license information.

if(USE_CUDA)
add_executable(test_util
list(APPEND test_cuda_sources
test_nvrtc.cpp
test_string.cpp
../test_common.cu)
else()
add_executable(test_util
test_nvrtc_hip.cpp
test_string.cpp
../test_common.hip)
if(USE_CUDA)
add_executable(test_util ${test_cuda_sources})
else()
get_hipified_list("${test_cuda_sources}" test_hip_sources)
message("${message_line}")
message(STATUS "test_util hipified sources: ${test_hip_sources}")

add_executable(test_util ${test_hip_sources})
endif()

find_package(OpenMP REQUIRED)
Expand Down
12 changes: 6 additions & 6 deletions transformer_engine/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was modified for portability to AMDGPU
# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
Expand Down Expand Up @@ -211,13 +211,13 @@ else()
include(Hipify)
message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")

# Parameters for hipify here should match hipify from build_tools/utils.py
# Because CMake hipify doesn't support INCLUDES we explicitly add pytorch and jax to IGNORES
set(header_include_dir
${CMAKE_CURRENT_SOURCE_DIR}/..
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/util
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "HIPIFY CUDA_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "HIPIFY HEADER_INCLUDE_DIR: ${header_include_dir}")
message(STATUS "Run hipify on ${CMAKE_CURRENT_SOURCE_DIR}")
hipify(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}
HEADER_INCLUDE_DIR ${header_include_dir}
IGNORES "*/amd_detail/*"
Expand All @@ -228,13 +228,13 @@ else()
IGNORES "*/rocshmem_api/*"
CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json"
NO_MATH_REPLACE
V2
)
get_hipified_list("${transformer_engine_SOURCES}" te_hip_sources)
message("${message_line}")
message(STATUS "nvte hipified sources: ${te_hip_sources}")

add_library(transformer_engine SHARED ${te_hip_sources})
target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
endif()

target_include_directories(transformer_engine PUBLIC
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/*************************************************************************
* This file was modified for portability to AMDGPU
* Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
Expand All @@ -11,7 +13,7 @@
#ifndef TRANSFORMER_ENGINE_MULTI_STREAM_H
#define TRANSFORMER_ENGINE_MULTI_STREAM_H

#include "cuda_runtime.h"
#include <cuda_runtime.h> //system CUDA header

#ifdef __cplusplus
extern "C" {
Expand Down
7 changes: 0 additions & 7 deletions transformer_engine/common/util/handle_manager.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
/*************************************************************************
* This file was modified for portability to AMDGPU
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
Expand All @@ -11,13 +9,8 @@

#include <vector>

#ifndef __HIP_PLATFORM_AMD__
#include "cuda_runtime.h"
#include "logging.h"
#else
#include "util/cuda_runtime.h"
#include "util/logging.h"
#endif

namespace transformer_engine::detail {

Expand Down
7 changes: 3 additions & 4 deletions transformer_engine/common/util/rocm_cast_gated_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/activation.h"
#include "transformer_engine/cast.h"
#include "vectorized_pointwise.h"
#include "utils.cuh"
#include "../utils.cuh"

namespace transformer_engine {
namespace gated_kernels {
Expand Down
21 changes: 10 additions & 11 deletions transformer_engine/common/util/rocm_cast_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,24 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/cast.h"
#include "transpose/cast_transpose.h"
#include "../transpose/cast_transpose.h"
#include "vectorized_pointwise.h"
#include "utils.cuh"
#include "../utils.cuh"

namespace transformer_engine {

// Forward declaration, definition is in cast_kernels.cuh
template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
float (*OP)(float, const ParamOP &)>
void mxfp8_quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
Tensor *output, Tensor *dbias, Tensor *workspace, cudaStream_t stream);
Tensor *output, Tensor *dbias, Tensor *workspace, hipStream_t stream);


constexpr size_t MXFP8_CHUNK_DIM_Y = 64;
Expand Down Expand Up @@ -401,15 +400,15 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK)
// Forward declaration of functions defined in `cast_kernels.cuh`
template <typename IType>
void reduce_dbias(const float *workspace_ptr, Tensor *dbias, const size_t rows, const size_t cols,
cudaStream_t stream);
hipStream_t stream);

template <typename ParamOP, float (*OP)(float, const ParamOP &)>
void CastVectorizedUnaryKernelLauncher(const Tensor &input, const Tensor *noop, Tensor *output,
cudaStream_t stream);
hipStream_t stream);

template <typename ParamOP, float (*OP)(float, const ParamOP &)>
void CastVectorizedUnaryGradKernelLauncher(const Tensor &grad, const Tensor *input, Tensor *output,
cudaStream_t stream);
hipStream_t stream);

constexpr size_t TILE_DIM = 32;
template <typename DTypeReduce>
Expand Down Expand Up @@ -445,7 +444,7 @@ __global__ void partial_reduce_kernel(const DTypeReduce* input, float* partial_o

template <typename DTypeReduce, typename DBiasTypeOut>
void reduce_dbias_rocm(const DTypeReduce *workspace_ptr, Tensor *dbias, const size_t rows,
const size_t cols, cudaStream_t stream, Tensor* partial_sum_workspace) {
const size_t cols, hipStream_t stream, Tensor* partial_sum_workspace) {
dim3 block_dim_partial(TILE_DIM, TILE_DIM);
dim3 grid_dim_partial(DIVUP(cols, TILE_DIM), DIVUP(rows, TILE_DIM));

Expand All @@ -464,7 +463,7 @@ template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
float (*OP)(float, const ParamOP &)>
void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tensor *noop,
Tensor *output, Tensor *dbias, Tensor *workspace,
cudaStream_t stream) {
hipStream_t stream) {
switch (output->scaling_mode) {
case NVTE_DELAYED_TENSOR_SCALING: {
const size_t rows = input.flat_first_dim();
Expand Down
9 changes: 4 additions & 5 deletions transformer_engine/common/util/rocm_dequantize_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,18 @@
#pragma once

#include <cfloat>
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <limits>

#include "common.h"
#include "../common.h"
#include "math.h"
#include "ptx.cuh"
#include "rocm_vectorized_2d.cuh"
#include "transformer_engine/activation.h"
#include "transformer_engine/cast.h"
#include "transpose/cast_transpose.h"
#include "../transpose/cast_transpose.h"
#include "transformer_engine/transpose.h"
#include "utils.cuh"
#include "../utils.cuh"
#include "vectorized_pointwise.h"

namespace transformer_engine {
Expand Down
Loading