diff --git a/3rdparty/hipify_torch b/3rdparty/hipify_torch index 3456cd19d..e2875e28a 160000 --- a/3rdparty/hipify_torch +++ b/3rdparty/hipify_torch @@ -1 +1 @@ -Subproject commit 3456cd19d4eb5e469317bfcfae1a89b7ab70f6c2 +Subproject commit e2875e28a397cf4ec13c7414d4387fe761f36e92 diff --git a/build_tools/utils.py b/build_tools/utils.py index 20d508a6e..c6da47d0f 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -502,30 +502,17 @@ def uninstall_te_wheel_packages(): ] ) -def detect_hipify_v2(): - try: - from torch.utils.hipify import __version__ - from packaging.version import Version - if Version(__version__) >= Version("2.0.0"): - return True - except Exception as e: - print("failed to detect pytorch hipify version, defaulting to version 1.0.0 behavior") - print(e) - return False - def hipify(base_dir, src_dir, sources, include_dirs): cwd = os.getcwd() - if detect_hipify_v2(): - hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python") - else: - hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.hipify_python") + hipify_module = importlib.import_module("3rdparty.hipify_torch.hipify_torch.v2.hipify_python") do_hipify = hipify_module.hipify + print(f"Run hipify on {src_dir}") hipify_result = do_hipify( project_directory=src_dir, output_directory=src_dir, - includes=["*"], - ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*"], + includes=["*/common/*", str(src_dir)+"/*"], + ignores=["*/amd_detail/*", "*/aotriton/*", "*/ck_fused_attn/*", "*/rocshmem_api/*"], header_include_dirs=[d for d in include_dirs if Path(d).is_relative_to(base_dir)], custom_map_list=base_dir / "hipify_custom_map.json", extra_files=[], diff --git a/hipify_custom_map.json b/hipify_custom_map.json index 97824bbdb..35e6b7999 100644 --- a/hipify_custom_map.json +++ b/hipify_custom_map.json @@ -1,8 +1,8 @@ { "custom_map" : { "" : "", - "" : "\"amd_detail/hip_float8.h\"", - "util/cuda_runtime.h" : 
"util/hip_runtime.h", + "" : "\"common/amd_detail/hip_float8.h\"", + "cuda_runtime.h\"" : "hip_runtime.h\"", "ATen/cudnn/Handle.h" : "ATen/miopen/Handle.h", "CUfunc_cache" : "hipFuncCache_t", "" : "", diff --git a/setup.py b/setup.py index 2f3e3c2ab..cd5cacc70 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,10 @@ def run(self): if rocm_build(): print("Running hipification of installable headers for ROCm build...") common_headers_dir = current_file_path / "transformer_engine/common/include" - hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir), []) + #TODO: some installable headers refer non installable headers (i.e not from common/include) + #so we need add extra include paths here to match hipification results with build process + hipify(current_file_path, common_headers_dir, all_files_in_dir(common_headers_dir), + [common_headers_dir, current_file_path / "transformer_engine"]) super().run() CMakeBuildExtension = get_build_ext(BuildExtension) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index b71addebf..57cdb02fe 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # This file was modified for portability to AMDGPU -# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. @@ -96,5 +96,34 @@ else() find_package(hip REQUIRED) endif() +if(USE_ROCM) + set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
+ set(THIRDPARTY ${TE}/3rdparty) + list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake") + include(Hipify) + message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") + + #hipify parameters should match those used in TE CMakeLists.txt + set(header_include_dir + ${TE}/transformer_engine/common/include + ${TE}/transformer_engine/common + ${TE}/transformer_engine) + + set(cuda_source_dir ${PROJECT_SOURCE_DIR} ) + message(STATUS "Run hipify on ${cuda_source_dir}") + hipify(CUDA_SOURCE_DIR ${cuda_source_dir} + HEADER_INCLUDE_DIR ${header_include_dir} + IGNORES "*/pytorch/csrc/*" + IGNORES "*/jax/csrc/*" + IGNORES "*/amd_detail/*" + IGNORES "*/aotriton/*" + IGNORES "*/ck_fused_attn/*" + IGNORES "*/rocshmem_api/*" + CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json" + NO_MATH_REPLACE + V2 + ) +endif() + add_subdirectory(operator) add_subdirectory(util) diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt index ebee930a1..d4beeb87c 100644 --- a/tests/cpp/operator/CMakeLists.txt +++ b/tests/cpp/operator/CMakeLists.txt @@ -41,31 +41,9 @@ endif() if(USE_CUDA) add_executable(test_operator ${test_cuda_sources}) else() - message("${message_line}") - message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") - message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}") - - set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
- set(THIRDPARTY ${TE}/3rdparty) - list(APPEND CMAKE_MODULE_PATH "${THIRDPARTY}/hipify_torch/cmake") - include(Hipify) - message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") - - file(REAL_PATH ../../../transformer_engine/common/include header_include_dir1) - file(REAL_PATH ../../../transformer_engine/common header_include_dir2) - set(header_include_dir ${header_include_dir1} ${header_include_dir2}) - - message(STATUS "CUDA_SOURCE_DIR: ${PROJECT_SOURCE_DIR}") - message(STATUS "HEADER_INCLUDE_DIR: ${header_include_dir}") - set(cuda_source_dir ${PROJECT_SOURCE_DIR} ) - hipify(CUDA_SOURCE_DIR ${cuda_source_dir} - HEADER_INCLUDE_DIR ${header_include_dir} - CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json" - NO_MATH_REPLACE - ) get_hipified_list("${test_cuda_sources}" test_hip_sources) message("${message_line}") - message(STATUS "nvte tests hipified sources: ${test_hip_sources}") + message(STATUS "test_operator hipified sources: ${test_hip_sources}") add_executable(test_operator ${test_hip_sources}) endif() diff --git a/tests/cpp/util/CMakeLists.txt b/tests/cpp/util/CMakeLists.txt index 51c855a91..ea986bfa5 100644 --- a/tests/cpp/util/CMakeLists.txt +++ b/tests/cpp/util/CMakeLists.txt @@ -4,16 +4,18 @@ # # See LICENSE for license information. 
-if(USE_CUDA) -add_executable(test_util +list(APPEND test_cuda_sources test_nvrtc.cpp test_string.cpp ../test_common.cu) -else() -add_executable(test_util - test_nvrtc_hip.cpp - test_string.cpp - ../test_common.hip) +if(USE_CUDA) + add_executable(test_util ${test_cuda_sources}) +else() + get_hipified_list("${test_cuda_sources}" test_hip_sources) + message("${message_line}") + message(STATUS "test_util hipified sources: ${test_hip_sources}") + + add_executable(test_util ${test_hip_sources}) endif() find_package(OpenMP REQUIRED) diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 50dcf90a0..ec0db3a66 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -1,5 +1,5 @@ # This file was modified for portability to AMDGPU -# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. @@ -211,13 +211,13 @@ else() include(Hipify) message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") + # Parameters for hipify here should match hipify from build_tools/utils.py + # Because CMake hipify doesn't support INCLUDES we explicitly add pytorch and jax to IGNORES set(header_include_dir ${CMAKE_CURRENT_SOURCE_DIR}/.. 
- ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/util + ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}) - message(STATUS "HIPIFY CUDA_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") - message(STATUS "HIPIFY HEADER_INCLUDE_DIR: ${header_include_dir}") + message(STATUS "Run hipify on ${CMAKE_CURRENT_SOURCE_DIR}") hipify(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} HEADER_INCLUDE_DIR ${header_include_dir} IGNORES "*/amd_detail/*" @@ -228,13 +228,13 @@ else() IGNORES "*/rocshmem_api/*" CUSTOM_MAP_FILE "${TE}/hipify_custom_map.json" NO_MATH_REPLACE + V2 ) get_hipified_list("${transformer_engine_SOURCES}" te_hip_sources) message("${message_line}") message(STATUS "nvte hipified sources: ${te_hip_sources}") add_library(transformer_engine SHARED ${te_hip_sources}) - target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") endif() target_include_directories(transformer_engine PUBLIC diff --git a/transformer_engine/common/include/transformer_engine/multi_stream.h b/transformer_engine/common/include/transformer_engine/multi_stream.h index e406a0786..ef786564a 100644 --- a/transformer_engine/common/include/transformer_engine/multi_stream.h +++ b/transformer_engine/common/include/transformer_engine/multi_stream.h @@ -1,4 +1,6 @@ /************************************************************************* + * This file was modified for portability to AMDGPU + * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. 
@@ -11,7 +13,7 @@ #ifndef TRANSFORMER_ENGINE_MULTI_STREAM_H #define TRANSFORMER_ENGINE_MULTI_STREAM_H -#include "cuda_runtime.h" +#include <cuda_runtime.h> //system CUDA header #ifdef __cplusplus extern "C" { diff --git a/transformer_engine/common/util/handle_manager.h b/transformer_engine/common/util/handle_manager.h index a63cd61c3..adb2f5558 100644 --- a/transformer_engine/common/util/handle_manager.h +++ b/transformer_engine/common/util/handle_manager.h @@ -1,6 +1,4 @@ /************************************************************************* - * This file was modified for portability to AMDGPU - * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. @@ -11,13 +9,8 @@ #include -#ifndef __HIP_PLATFORM_AMD__ #include "cuda_runtime.h" #include "logging.h" -#else -#include "util/cuda_runtime.h" -#include "util/logging.h" -#endif namespace transformer_engine::detail { diff --git a/transformer_engine/common/util/rocm_cast_gated_kernels.cuh b/transformer_engine/common/util/rocm_cast_gated_kernels.cuh index a53fd51c5..387445a78 100644 --- a/transformer_engine/common/util/rocm_cast_gated_kernels.cuh +++ b/transformer_engine/common/util/rocm_cast_gated_kernels.cuh @@ -7,17 +7,16 @@ #pragma once #include -#include -#include +#include -#include "common.h" +#include "../common.h" #include "math.h" #include "ptx.cuh" #include "rocm_vectorized_2d.cuh" #include "transformer_engine/activation.h" #include "transformer_engine/cast.h" #include "vectorized_pointwise.h" -#include "utils.cuh" +#include "../utils.cuh" namespace transformer_engine { namespace gated_kernels { diff --git a/transformer_engine/common/util/rocm_cast_kernels.cuh b/transformer_engine/common/util/rocm_cast_kernels.cuh index eb0c9b94d..33c53e8e8 100644 --- a/transformer_engine/common/util/rocm_cast_kernels.cuh +++ b/transformer_engine/common/util/rocm_cast_kernels.cuh @@ -6,17 +6,16 @@ 
#pragma once #include -#include -#include +#include -#include "common.h" +#include "../common.h" #include "math.h" #include "ptx.cuh" #include "rocm_vectorized_2d.cuh" #include "transformer_engine/cast.h" -#include "transpose/cast_transpose.h" +#include "../transpose/cast_transpose.h" #include "vectorized_pointwise.h" -#include "utils.cuh" +#include "../utils.cuh" namespace transformer_engine { @@ -24,7 +23,7 @@ namespace transformer_engine { template void mxfp8_quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop, - Tensor *output, Tensor *dbias, Tensor *workspace, cudaStream_t stream); + Tensor *output, Tensor *dbias, Tensor *workspace, hipStream_t stream); constexpr size_t MXFP8_CHUNK_DIM_Y = 64; @@ -401,15 +400,15 @@ __global__ void __launch_bounds__(MXFP8_THREADS_PER_CHUNK) // Forward declaration of functions defined in `cast_kernels.cuh` template void reduce_dbias(const float *workspace_ptr, Tensor *dbias, const size_t rows, const size_t cols, - cudaStream_t stream); + hipStream_t stream); template void CastVectorizedUnaryKernelLauncher(const Tensor &input, const Tensor *noop, Tensor *output, - cudaStream_t stream); + hipStream_t stream); template void CastVectorizedUnaryGradKernelLauncher(const Tensor &grad, const Tensor *input, Tensor *output, - cudaStream_t stream); + hipStream_t stream); constexpr size_t TILE_DIM = 32; template @@ -445,7 +444,7 @@ __global__ void partial_reduce_kernel(const DTypeReduce* input, float* partial_o template void reduce_dbias_rocm(const DTypeReduce *workspace_ptr, Tensor *dbias, const size_t rows, - const size_t cols, cudaStream_t stream, Tensor* partial_sum_workspace) { + const size_t cols, hipStream_t stream, Tensor* partial_sum_workspace) { dim3 block_dim_partial(TILE_DIM, TILE_DIM); dim3 grid_dim_partial(DIVUP(cols, TILE_DIM), DIVUP(rows, TILE_DIM)); @@ -464,7 +463,7 @@ template void fp8_quantize_rocm(const Tensor &input, const Tensor *act_input, const Tensor *noop, Tensor *output, Tensor *dbias, Tensor 
*workspace, - cudaStream_t stream) { + hipStream_t stream) { switch (output->scaling_mode) { case NVTE_DELAYED_TENSOR_SCALING: { const size_t rows = input.flat_first_dim(); diff --git a/transformer_engine/common/util/rocm_dequantize_kernels.cuh b/transformer_engine/common/util/rocm_dequantize_kernels.cuh index 398e4c0ad..0d020b5eb 100644 --- a/transformer_engine/common/util/rocm_dequantize_kernels.cuh +++ b/transformer_engine/common/util/rocm_dequantize_kernels.cuh @@ -7,19 +7,18 @@ #pragma once #include -#include -#include +#include #include -#include "common.h" +#include "../common.h" #include "math.h" #include "ptx.cuh" #include "rocm_vectorized_2d.cuh" #include "transformer_engine/activation.h" #include "transformer_engine/cast.h" -#include "transpose/cast_transpose.h" +#include "../transpose/cast_transpose.h" #include "transformer_engine/transpose.h" -#include "utils.cuh" +#include "../utils.cuh" #include "vectorized_pointwise.h" namespace transformer_engine {