From 7edaad893d6f3876672877129a45f4e219eeeed6 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Feb 2026 17:21:56 -0600 Subject: [PATCH 1/6] Update ck_fused_attn logging to direct to thread-specific files --- .../ck_fused_attn/src/ck_fused_attn_bwd.cpp | 362 ++++++++++-------- .../ck_fused_attn/src/ck_fused_attn_fwd.cpp | 180 +++++---- .../ck_fused_attn/src/ck_fused_attn_utils.hpp | 26 +- 3 files changed, 314 insertions(+), 254 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp index 3f51b96b6..3d04ead5e 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp @@ -16,6 +16,22 @@ namespace ck_fused_attn{ +// We want to cache and reuse the log stream so we use thread_local here. +namespace { +std::ofstream* get_bwd_log_stream() { + thread_local std::ofstream log_file; + thread_local bool attempted = false; + if (!attempted) { + attempted = true; + open_ck_fused_attn_log_file(log_file, "ck_fused_attn_bwd"); + } + if (!log_file.is_open()) { + return nullptr; + } + return &log_file; +} +} // namespace + // TODO: unify with binary search in TE/common/fused_attn(rocm)/util // no device std::upper_bound // in an increasing array with given size len, search for the index that: @@ -346,110 +362,104 @@ void log_bwd_config(const char* func_name, const bool is_v3_atomic_fp32, const int how_v3_bf16_cvt, const fmha_bwd_args& fmha_args){ - - bool ck_fused_attn_log_config = false; - if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) == "1") - ck_fused_attn_log_config = true; - } - if (ck_fused_attn_log_config) { - std::cout<::type>(mask_type)<::type>(bias_type)<::type>(mask_type) << "\n"; + *log_file << "bias_type: " << static_cast::type>(bias_type) << "\n"; + *log_file << "has_dbias: " << has_dbias << "\n"; + *log_file << "has_dropout: " << has_dropout << "\n"; + *log_file << "is_store_randval: " << is_store_randval << "\n"; + *log_file << "is_deterministic: " << is_deterministic << "\n"; + *log_file << "uses_bwd_v3: " << uses_bwd_v3 << "\n"; + *log_file << "is_v3_atomic_fp32: " << is_v3_atomic_fp32 << "\n"; + *log_file << "how_v3_bf16_cvt: " << how_v3_bf16_cvt << "\n"; // fmha_args debug - std::cout<(std::get>(fmha_args.drop_seed_offset))<(std::get>(fmha_args.drop_seed_offset))<(std::get>(fmha_args.drop_seed_offset)) << "\n"; + *log_file << "dropout_offset_ptr: " << std::get<1>(std::get>(fmha_args.drop_seed_offset)) << "\n"; } } @@ -531,7 +541,7 @@ hipError_t ck_attn_bwd( mask_enum mask_type = static_cast(attn_mask_type); bool ck_fused_attn_log_config = false; if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) == "1") + if (env_p != nullptr && std::string(env_p) != "") ck_fused_attn_log_config = true; } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); @@ -708,17 +718,19 @@ hipError_t ck_attn_bwd( if (d_qk == d_v) { dim3 block(d_qk); if (ck_fused_attn_log_config){ - std::cout<(dbias_ptr));); }else if(bias_shape==BiasShape::k1HSS){ if (ck_fused_attn_log_config){ - std::cout<(dbias_ptr));); }else if(bias_shape==BiasShape::kB1SS){ if (ck_fused_attn_log_config){ - std::cout<::type>(mask_type)<::type>(bias_type)<::type>(mask_type) << "\n"; + *log_file << "bias_type: " << static_cast::type>(bias_type) << "\n"; + *log_file << "has_lse: " << has_lse << "\n"; + *log_file << "has_dropout: " << has_dropout << "\n"; + *log_file << "do_fp8_static_quant: " << do_fp8_static_quant << "\n"; + *log_file << "skip_min_seqlen_q: " << (fmha_args.min_seqlen_q != 0) << "\n"; + *log_file << "uses_fwd_v3: " << uses_fwd_v3 << "\n"; + *log_file << "how_v3_bf16_cvt: " << how_v3_bf16_cvt << "\n"; // debug fmha_args - std::cout<(std::get>(fmha_args.drop_seed_offset))<(std::get>(fmha_args.drop_seed_offset))<(std::get>(fmha_args.drop_seed_offset)) << "\n"; + *log_file << "dropout_offset_ptr: " << std::get<1>(std::get>(fmha_args.drop_seed_offset)) << "\n"; } } @@ -181,7 +191,7 @@ hipError_t ck_attn_fwd( bool ck_fused_attn_log_config = false; if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) == "1") + if (env_p != nullptr && std::string(env_p) != "") ck_fused_attn_log_config = true; } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); @@ -356,7 +366,7 @@ hipError_t ck_attn_varlen_fwd( bool ck_fused_attn_log_config = false; if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) == "1") + if (env_p != nullptr && std::string(env_p) != "") ck_fused_attn_log_config = true; } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index a75915ee2..b0ba19b08 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -7,8 +7,12 @@ #ifndef CK_FUSED_ATTN_UTILS_H #define CK_FUSED_ATTN_UTILS_H -#include -#include +#include +#include +#include +#include +#include +#include #include //forward declaration for ck_tile enum @@ -56,5 +60,23 @@ std::pair get_ck_bias_type_shape(BiasType attn_bias_type, uint64_t get_runtime_max_seqlen(uint64_t b, const void* cu_seqlen_ptr, const void* cu_seqlen_padded_ptr, void* workspace, hipStream_t stream); +inline bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix) { + const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG"); + if (env_p == nullptr) { + return false; + } + const std::string log_dir_str(env_p); + if (log_dir_str.empty() || log_dir_str == "0") { + return false; + } + std::filesystem::path log_dir(log_dir_str); + std::error_code ec; + std::filesystem::create_directories(log_dir, ec); + std::ostringstream filename; + filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; + log_file.open(log_dir / filename.str(), std::ios_base::app); + return log_file.is_open(); +} + }//namespace ck_fused_attn #endif // CK_FUSED_ATTN_UTILS_H From 13920f8012f11240a7bb017653035480ce2a9d41 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Feb 2026 17:23:42 -0600 Subject: [PATCH 2/6] Added error logging --- .../common/ck_fused_attn/src/ck_fused_attn_utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index b0ba19b08..10cd28bfd 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -72,6 +72,10 @@ inline bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* fil std::filesystem::path log_dir(log_dir_str); std::error_code ec; std::filesystem::create_directories(log_dir, ec); + if(ec){ + std::cerr << "Failed to create log directory: " << log_dir_str << ", error: " << ec.message() << std::endl; + return false; + } std::ostringstream filename; filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; log_file.open(log_dir / filename.str(), std::ios_base::app); From d33b4994759482a714316b35710556239a823cf4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 19 Feb 2026 17:26:09 -0600 Subject: [PATCH 3/6] Moved function body out of header --- .../ck_fused_attn/src/ck_fused_attn_utils.cpp | 22 +++++++++++++++++++ .../ck_fused_attn/src/ck_fused_attn_utils.hpp | 22 +------------------ 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp index 26c92ca2b..86abf4ffe 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp @@ -13,6 +13,28 @@ namespace ck_fused_attn{ +bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix) { + const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG"); + if (env_p == nullptr) { + return false; + } + const std::string log_dir_str(env_p); + if (log_dir_str.empty() || log_dir_str == "0") { + return false; + } + std::filesystem::path log_dir(log_dir_str); + std::error_code ec; + std::filesystem::create_directories(log_dir, ec); + if(ec){ + std::cerr << "Failed to create log directory: " << log_dir_str << ", error: " << ec.message() << std::endl; + return false; + } + std::ostringstream filename; + filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; + log_file.open(log_dir / filename.str(), std::ios_base::app); + return log_file.is_open(); +} + std::string get_data_type_str(DType dtype){ std::string data_type_str; if(dtype==DType::kFloat16){ diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index 10cd28bfd..6b1dbe711 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -60,27 +60,7 @@ std::pair get_ck_bias_type_shape(BiasType attn_bias_type, uint64_t get_runtime_max_seqlen(uint64_t b, const void* cu_seqlen_ptr, const void* cu_seqlen_padded_ptr, void* workspace, hipStream_t stream); -inline bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix) { - const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG"); - if (env_p == nullptr) { - return false; - } - const std::string log_dir_str(env_p); - if (log_dir_str.empty() || log_dir_str == "0") { - return false; - } - std::filesystem::path log_dir(log_dir_str); - std::error_code ec; - std::filesystem::create_directories(log_dir, ec); - if(ec){ - std::cerr << "Failed to create log directory: " << log_dir_str << ", error: " << ec.message() << std::endl; - return false; - } - std::ostringstream filename; - filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; - log_file.open(log_dir / filename.str(), std::ios_base::app); - return log_file.is_open(); -} +bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix); }//namespace ck_fused_attn #endif // CK_FUSED_ATTN_UTILS_H From 85a52f7e642317e931579090ba1150848bb3affa Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 20 Feb 2026 14:43:18 -0600 Subject: [PATCH 4/6] Removed risky dir-create and streamlined header --- .../common/ck_fused_attn/src/ck_fused_attn_utils.cpp | 10 ++++------ .../common/ck_fused_attn/src/ck_fused_attn_utils.hpp | 4 ---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp index 86abf4ffe..f601bf060 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp @@ -5,6 +5,10 @@ ************************************************************************/ #include +#include +#include +#include +#include #include "ck_fused_attn_utils.hpp" #include "ck_fused_attn/ck_fused_attn.hpp" #include "mask.hpp" @@ -23,12 +27,6 @@ bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefi return false; } std::filesystem::path log_dir(log_dir_str); - std::error_code ec; - std::filesystem::create_directories(log_dir, ec); - if(ec){ - std::cerr << "Failed to create log directory: " << log_dir_str << ", error: " << ec.message() << std::endl; - return false; - } std::ostringstream filename; filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; log_file.open(log_dir / filename.str(), std::ios_base::app); diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index 6b1dbe711..cac1a0b9d 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -9,10 +9,6 @@ #include #include -#include -#include -#include -#include #include //forward declaration for ck_tile enum From 11184b810d2bdd1ab18a9aa6b9d5473567f7f2b1 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 23 Feb 2026 10:27:28 -0600 Subject: [PATCH 5/6] Minor refactor --- .../ck_fused_attn/src/ck_fused_attn_bwd.cpp | 207 ++++++++---------- .../ck_fused_attn/src/ck_fused_attn_fwd.cpp | 37 ++-- .../ck_fused_attn/src/ck_fused_attn_utils.cpp | 16 +- .../ck_fused_attn/src/ck_fused_attn_utils.hpp | 3 +- 4 files changed, 126 insertions(+), 137 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp index 3d04ead5e..12a9a9c0f 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp @@ -18,16 +18,28 @@ namespace ck_fused_attn{ // We want to cache and reuse the log stream so we use thread_local here. namespace { -std::ofstream* get_bwd_log_stream() { +std::ostream* get_bwd_log_stream() { thread_local std::ofstream log_file; thread_local bool attempted = false; + thread_local bool opened = false; + thread_local bool requested = false; + thread_local std::string log_dir_str; if (!attempted) { attempted = true; - open_ck_fused_attn_log_file(log_file, "ck_fused_attn_bwd"); + if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG")) { + log_dir_str = std::string(env_p); + requested = !log_dir_str.empty() && log_dir_str != "0"; + } + if (requested) { + opened = open_ck_fused_attn_log_file(log_file, "ck_fused_attn_bwd", log_dir_str); + } } - if (!log_file.is_open()) { + if (!requested) { return nullptr; } + if (!opened) { + return &std::cout; + } return &log_file; } } // namespace @@ -539,15 +551,10 @@ hipError_t ck_attn_bwd( right = window_size_right; mask_enum mask_type = static_cast(attn_mask_type); - bool ck_fused_attn_log_config = false; - if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) != "") - ck_fused_attn_log_config = true; - } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); // print kernel name on verbose mode - ck_tile::stream_config stream_config{stream, dump_path!=nullptr, ck_fused_attn_log_config}; + ck_tile::stream_config stream_config{stream, dump_path!=nullptr, get_bwd_log_stream() != nullptr}; ck_tile::index_t shape_seqlen_q = seqlen_q; ck_tile::index_t shape_seqlen_k = seqlen_k; @@ -717,20 +724,18 @@ hipError_t ck_attn_bwd( dim3 grid(b, s_kv, hg); if (d_qk == d_v) { dim3 block(d_qk); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_dv_reduce: " << "\n"; - *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; - *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; - *log_file << "stride_b_dkv_expanded: " << stride_b_dk_expanded << "\n"; - *log_file << "stride_h_dkv_expanded: " << stride_h_dk_expanded << "\n"; - *log_file << "stride_s_dkv_expanded: " << stride_s_dk_expanded << "\n"; - *log_file << "dk_ptr: " << dk_ptr << "\n"; - *log_file << "dv_ptr: " << dv_ptr << "\n"; - *log_file << "stride_b_dk: " << stride_b_dk << "\n"; - *log_file << "stride_h_dk: " << stride_h_dk << "\n"; - *log_file << "stride_s_dk: " << stride_s_dk << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_dv_reduce: " << "\n"; + *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; + *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; + *log_file << "stride_b_dkv_expanded: " << stride_b_dk_expanded << "\n"; + *log_file << "stride_h_dkv_expanded: " << stride_h_dk_expanded << "\n"; + *log_file << "stride_s_dkv_expanded: " << stride_s_dk_expanded << "\n"; + *log_file << "dk_ptr: " << dk_ptr << "\n"; + *log_file << "dv_ptr: " << dv_ptr << "\n"; + *log_file << "stride_b_dk: " << stride_b_dk << "\n"; + *log_file << "stride_h_dk: " << stride_h_dk << "\n"; + *log_file << "stride_s_dk: " << stride_s_dk << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -744,18 +749,16 @@ hipError_t ck_attn_bwd( stride_b_dk, stride_h_dk, stride_s_dk);); } else { dim3 block_dk(d_qk); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_or_dv_reduce on dk: " << "\n"; - *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; - *log_file << "stride_b_dk_expanded: " << stride_b_dk_expanded << "\n"; - *log_file << "stride_h_dk_expanded: " << stride_h_dk_expanded << "\n"; - *log_file << "stride_s_dk_expanded: " << stride_s_dk_expanded << "\n"; - *log_file << "dk_ptr: " << dk_ptr << "\n"; - *log_file << "stride_b_dk: " << stride_b_dk << "\n"; - *log_file << "stride_h_dk: " << stride_h_dk << "\n"; - *log_file << "stride_s_dk: " << stride_s_dk << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_or_dv_reduce on dk: " << "\n"; + *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; + *log_file << "stride_b_dk_expanded: " << stride_b_dk_expanded << "\n"; + *log_file << "stride_h_dk_expanded: " << stride_h_dk_expanded << "\n"; + *log_file << "stride_s_dk_expanded: " << stride_s_dk_expanded << "\n"; + *log_file << "dk_ptr: " << dk_ptr << "\n"; + *log_file << "stride_b_dk: " << stride_b_dk << "\n"; + *log_file << "stride_h_dk: " << stride_h_dk << "\n"; + *log_file << "stride_s_dk: " << stride_s_dk << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -767,18 +770,16 @@ hipError_t ck_attn_bwd( stride_b_dk, stride_h_dk, stride_s_dk);); dim3 block_dv(d_v); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_or_dv_reduce on dv: " << "\n"; - *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; - *log_file << "stride_b_dv_expanded: " << stride_b_dv_expanded << "\n"; - *log_file << "stride_h_dv_expanded: " << stride_h_dv_expanded << "\n"; - *log_file << "stride_s_dv_expanded: " << stride_s_dv_expanded << "\n"; - *log_file << "dv_ptr: " << dv_ptr << "\n"; - *log_file << "stride_b_dv: " << stride_b_dv << "\n"; - *log_file << "stride_h_dv: " << stride_h_dv << "\n"; - *log_file << "stride_s_dv: " << stride_s_dv << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_or_dv_reduce on dv: " << "\n"; + *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; + *log_file << "stride_b_dv_expanded: " << stride_b_dv_expanded << "\n"; + *log_file << "stride_h_dv_expanded: " << stride_h_dv_expanded << "\n"; + *log_file << "stride_s_dv_expanded: " << stride_s_dv_expanded << "\n"; + *log_file << "dv_ptr: " << dv_ptr << "\n"; + *log_file << "stride_b_dv: " << stride_b_dv << "\n"; + *log_file << "stride_h_dv: " << stride_h_dv << "\n"; + *log_file << "stride_s_dv: " << stride_s_dv << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -797,12 +798,10 @@ hipError_t ck_attn_bwd( dim3 block(THREADS_PER_BLOCK); dim3 grid(ceil(1.0 * s_q * s_kv/THREADS_PER_BLOCK)); if(bias_shape==BiasShape::k11SS){ - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dbias_reduce_11SS: " << "\n"; - *log_file << "dbias_ptr: " << dbias_ptr << "\n"; - *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dbias_reduce_11SS: " << "\n"; + *log_file << "dbias_ptr: " << dbias_ptr << "\n"; + *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -811,12 +810,10 @@ hipError_t ck_attn_bwd( static_cast(dbias_expanded_ptr), static_cast(dbias_ptr));); }else if(bias_shape==BiasShape::k1HSS){ - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dbias_reduce_1HSS: " << "\n"; - *log_file << "dbias_ptr: " << dbias_ptr << "\n"; - *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dbias_reduce_1HSS: " << "\n"; + *log_file << "dbias_ptr: " << dbias_ptr << "\n"; + *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -825,12 +822,10 @@ hipError_t ck_attn_bwd( static_cast(dbias_expanded_ptr), static_cast(dbias_ptr));); }else if(bias_shape==BiasShape::kB1SS){ - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dbias_reduce_B1SS: " << "\n"; - *log_file << "dbias_ptr: " << dbias_ptr << "\n"; - *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dbias_reduce_B1SS: " << "\n"; + *log_file << "dbias_ptr: " << dbias_ptr << "\n"; + *log_file << "dbias_expanded_ptr: " << dbias_expanded_ptr << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -907,14 +902,9 @@ hipError_t ck_attn_varlen_bwd( right = window_size_right; mask_enum mask_type = static_cast(attn_mask_type); - bool ck_fused_attn_log_config = false; - if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) != "") - ck_fused_attn_log_config = true; - } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); // print kernel name on verbose mode - ck_tile::stream_config stream_config{stream, dump_path!=nullptr, ck_fused_attn_log_config}; + ck_tile::stream_config stream_config{stream, dump_path!=nullptr, get_bwd_log_stream() != nullptr}; std::string data_type_str = get_data_type_str(dtype); @@ -1056,8 +1046,9 @@ hipError_t ck_attn_varlen_bwd( // lse_thd_ptr used as buffer if(const char* env_p = std::getenv("NVTE_CK_RUNTIME_MAX_SEQLEN")) { if(std::string(env_p) == "1"){ - if(ck_fused_attn_log_config){ - std::cout << "attn_bwd(ck): Enabling runtime max_seqlen calculation for small seqlen optimization."; + if (auto* log_file = get_bwd_log_stream()) { + *log_file + << "attn_bwd(ck): Enabling runtime max_seqlen calculation for small seqlen optimization.\n"; } fmha_args.max_seqlen_q = get_runtime_max_seqlen(b, cu_seqlen_q_ptr, nullptr, lse_workspace_ptr, stream); fmha_args.max_seqlen_k = get_runtime_max_seqlen(b, cu_seqlen_kv_ptr, nullptr, lse_workspace_ptr, stream); @@ -1090,20 +1081,18 @@ hipError_t ck_attn_varlen_bwd( dim3 grid(max_tokens_kv, hg); if (d_qk == d_v) { dim3 block(d_qk); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_dv_reduce_thd: " << "\n"; - *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; - *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; - *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; - *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; - *log_file << "stride_h_dkv_expanded: " << stride_h_dk_expanded << "\n"; - *log_file << "stride_s_dkv_expanded: " << stride_s_dk_expanded << "\n"; - *log_file << "dk_ptr: " << dk_ptr << "\n"; - *log_file << "dv_ptr: " << dv_ptr << "\n"; - *log_file << "stride_h_dk: " << stride_h_dk << "\n"; - *log_file << "stride_s_dk: " << stride_s_dk << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_dv_reduce_thd: " << "\n"; + *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; + *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; + *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; + *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; + *log_file << "stride_h_dkv_expanded: " << stride_h_dk_expanded << "\n"; + *log_file << "stride_s_dkv_expanded: " << stride_s_dk_expanded << "\n"; + *log_file << "dk_ptr: " << dk_ptr << "\n"; + *log_file << "dv_ptr: " << dv_ptr << "\n"; + *log_file << "stride_h_dk: " << stride_h_dk << "\n"; + *log_file << "stride_s_dk: " << stride_s_dk << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -1119,18 +1108,16 @@ hipError_t ck_attn_varlen_bwd( stride_h_dk, stride_s_dk);); } else { dim3 block_dk(d_qk); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_or_dv_reduce_thd on dk: " << "\n"; - *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; - *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; - *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; - *log_file << "stride_h_dk_expanded: " << stride_h_dk_expanded << "\n"; - *log_file << "stride_s_dk_expanded: " << stride_s_dk_expanded << "\n"; - *log_file << "dk_ptr: " << dk_ptr << "\n"; - *log_file << "stride_h_dk: " << stride_h_dk << "\n"; - *log_file << "stride_s_dk: " << stride_s_dk << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_or_dv_reduce_thd on dk: " << "\n"; + *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; + *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; + *log_file << "dk_expanded_ptr: " << dk_expanded_ptr << "\n"; + *log_file << "stride_h_dk_expanded: " << stride_h_dk_expanded << "\n"; + *log_file << "stride_s_dk_expanded: " << stride_s_dk_expanded << "\n"; + *log_file << "dk_ptr: " << dk_ptr << "\n"; + *log_file << "stride_h_dk: " << stride_h_dk << "\n"; + *log_file << "stride_s_dk: " << stride_s_dk << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( @@ -1144,18 +1131,16 @@ hipError_t ck_attn_varlen_bwd( stride_h_dk, stride_s_dk);); dim3 block_dv(d_v); - if (ck_fused_attn_log_config){ - if (auto* log_file = get_bwd_log_stream()) { - *log_file << "\n" << "run dk_or_dv_reduce_thd on dv: " << "\n"; - *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; - *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; - *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; - *log_file << "stride_h_dv_expanded: " << stride_h_dv_expanded << "\n"; - *log_file << "stride_s_dv_expanded: " << stride_s_dv_expanded << "\n"; - *log_file << "dv_ptr: " << dv_ptr << "\n"; - *log_file << "stride_h_dv: " << stride_h_dv << "\n"; - *log_file << "stride_s_dv: " << stride_s_dv << "\n"; - } + if (auto* log_file = get_bwd_log_stream()) { + *log_file << "\n" << "run dk_or_dv_reduce_thd on dv: " << "\n"; + *log_file << "cu_seqlen_kv_ptr: " << cu_seqlen_kv_ptr << "\n"; + *log_file << "cu_seqlen_kv_padded_ptr: " << cu_seqlen_kv_padded_ptr << "\n"; + *log_file << "dv_expanded_ptr: " << dv_expanded_ptr << "\n"; + *log_file << "stride_h_dv_expanded: " << stride_h_dv_expanded << "\n"; + *log_file << "stride_s_dv_expanded: " << stride_s_dv_expanded << "\n"; + *log_file << "dv_ptr: " << dv_ptr << "\n"; + *log_file << "stride_h_dv: " << stride_h_dv << "\n"; + *log_file << "stride_s_dv: " << stride_s_dv << "\n"; } CK_FUSED_ATTN_TYPE_SWITCH_16BIT(dtype, CK_TILE_TYPE, hipLaunchKernelGGL( diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp index abb49f3a5..5f231ee2d 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp @@ -17,16 +17,28 @@ namespace ck_fused_attn{ namespace { -std::ofstream* get_fwd_log_stream() { +std::ostream* get_fwd_log_stream() { thread_local std::ofstream log_file; thread_local bool attempted = false; + thread_local bool opened = false; + thread_local bool requested = false; + thread_local std::string log_dir_str; if (!attempted) { attempted = true; - open_ck_fused_attn_log_file(log_file, "ck_fused_attn_fwd"); + if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG")) { + log_dir_str = std::string(env_p); + requested = !log_dir_str.empty() && log_dir_str != "0"; + } + if (requested) { + opened = open_ck_fused_attn_log_file(log_file, "ck_fused_attn_fwd", log_dir_str); + } } - if (!log_file.is_open()) { + if (!requested) { return nullptr; } + if (!opened) { + return &std::cout; + } return &log_file; } } // namespace @@ -189,14 +201,9 @@ hipError_t ck_attn_fwd( right = window_size_right; mask_enum mask_type = static_cast(attn_mask_type); - bool ck_fused_attn_log_config = false; - if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) != "") - ck_fused_attn_log_config = true; - } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); // print kernel name on verbose mode - ck_tile::stream_config stream_config{stream, dump_path!=nullptr, ck_fused_attn_log_config}; + ck_tile::stream_config stream_config{stream, dump_path!=nullptr, get_fwd_log_stream() != nullptr}; std::string data_type_str = get_data_type_str(dtype); @@ -364,14 +371,9 @@ hipError_t ck_attn_varlen_fwd( bias_enum bias_type = bias_enum::no_bias; - bool ck_fused_attn_log_config = false; - if (const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG") ) { - if (env_p != nullptr && std::string(env_p) != "") - ck_fused_attn_log_config = true; - } const char* dump_path = std::getenv("NVTE_DUMP_AITER_RT"); // print kernel name on verbose mode - ck_tile::stream_config stream_config{stream, dump_path!=nullptr, ck_fused_attn_log_config}; + ck_tile::stream_config stream_config{stream, dump_path!=nullptr, get_fwd_log_stream() != nullptr}; std::string data_type_str = get_data_type_str(dtype); @@ -467,8 +469,9 @@ hipError_t ck_attn_varlen_fwd( // lse_thd_ptr used as buffer if(const char* env_p = std::getenv("NVTE_CK_RUNTIME_MAX_SEQLEN")){ if(std::string(env_p) == "1"){ - if(ck_fused_attn_log_config){ - std::cout << "attn_fwd(ck): Enabling runtime max_seqlen calculation for small seqlen optimization."; + if (auto* log_file = get_fwd_log_stream()) { + *log_file + << "attn_fwd(ck): Enabling runtime max_seqlen calculation for small seqlen optimization.\n"; } fmha_args.max_seqlen_q = get_runtime_max_seqlen(b, cu_seqlen_q_ptr, cu_seqlen_q_padded_ptr, lse_thd_ptr, stream); } diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp index f601bf060..c1361a6f1 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp @@ -17,20 +17,20 @@ namespace ck_fused_attn{ -bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix) { - const char* env_p = std::getenv("CK_FUSED_ATTN_LOG_CONFIG"); - if (env_p == nullptr) { - return false; - } - const std::string log_dir_str(env_p); - if (log_dir_str.empty() || log_dir_str == "0") { +bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix, const std::string& log_dir_str) { + // Explicitly use std::cout as a fallback + if (log_dir_str == "1") { return false; } std::filesystem::path log_dir(log_dir_str); std::ostringstream filename; filename << file_prefix << "_" << getpid() << "_" << std::this_thread::get_id() << ".log"; log_file.open(log_dir / filename.str(), std::ios_base::app); - return log_file.is_open(); + if (!log_file.is_open()) { + std::cerr << "Failed to open log file: " << (log_dir / filename.str()) << "\n"; + return false; + } + return true; } std::string get_data_type_str(DType dtype){ diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index cac1a0b9d..13e3d3c0a 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -9,6 +9,7 @@ #include #include +#include #include //forward declaration for ck_tile enum @@ -56,7 +57,7 @@ std::pair get_ck_bias_type_shape(BiasType attn_bias_type, uint64_t get_runtime_max_seqlen(uint64_t b, const void* cu_seqlen_ptr, const void* cu_seqlen_padded_ptr, void* workspace, hipStream_t stream); -bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix); +bool open_ck_fused_attn_log_file(std::ofstream& log_file, const char* file_prefix, const std::string& log_dir_str); }//namespace ck_fused_attn #endif // CK_FUSED_ATTN_UTILS_H From ce12f95ad2a72b81f3222fcc4810eb33eb47039f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 23 Feb 2026 10:42:10 -0600 Subject: [PATCH 6/6] Copyright --- .../common/ck_fused_attn/src/ck_fused_attn_bwd.cpp | 2 +- .../common/ck_fused_attn/src/ck_fused_attn_fwd.cpp | 2 +- .../common/ck_fused_attn/src/ck_fused_attn_utils.cpp | 2 +- .../common/ck_fused_attn/src/ck_fused_attn_utils.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp index 12a9a9c0f..0bd062f6b 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_bwd.cpp @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/ diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp index 5f231ee2d..935caa9eb 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_fwd.cpp @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/ diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp index c1361a6f1..6bbfbda4f 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.cpp @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/ diff --git a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp index 13e3d3c0a..a0ea13d81 100644 --- a/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp +++ b/transformer_engine/common/ck_fused_attn/src/ck_fused_attn_utils.hpp @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. * * License for AMD contributions = MIT. See LICENSE for more information ************************************************************************/