From 7dc0eaed73b78a9c4bbec0ab257bdff5809ed63f Mon Sep 17 00:00:00 2001
From: Rita
Date: Thu, 20 Nov 2025 16:21:09 +0800
Subject: [PATCH 1/4] Modify pip install commands in README

Updated installation commands to include --no-build-isolation flag.
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 773ebc81..1692c717 100644
--- a/README.md
+++ b/README.md
@@ -314,12 +314,12 @@ bash prerequisite.sh

 ```bash
 cd megatron/shm_tensor_new_rdma
-pip install -e .
+pip install -e . --no-build-isolation
 ```

 ```bash
 cd megatron/shm_tensor_new_rdma_pre_alloc
-pip install -e .
+pip install -e . --no-build-isolation
 ```

 ### Run

From ecd69f7bfb711c2f815693e6866da1e649031b21 Mon Sep 17 00:00:00 2001
From: Rita
Date: Thu, 20 Nov 2025 16:28:16 +0800
Subject: [PATCH 2/4] Refactor setup.py to use common compile and link args

---
 megatron/shm_tensor_new_rdma/setup.py | 46 +++++++++++++++++++--------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/megatron/shm_tensor_new_rdma/setup.py b/megatron/shm_tensor_new_rdma/setup.py
index 1f33546a..2316bc53 100644
--- a/megatron/shm_tensor_new_rdma/setup.py
+++ b/megatron/shm_tensor_new_rdma/setup.py
@@ -1,5 +1,26 @@
 from setuptools import setup
 from torch.utils.cpp_extension import CppExtension, BuildExtension, include_paths
+import torch
+
+abi_flag = getattr(torch._C, "_GLIBCXX_USE_CXX11_ABI", None)
+if abi_flag is None:
+    abi_flag = 1
+
+abi_macro = f"-D_GLIBCXX_USE_CXX11_ABI={int(abi_flag)}"
+
+common_extra_compile_args = [
+    "-fPIC",
+    "-std=c++17",
+    abi_macro,
+    "-I/usr/local/cuda/include",
+]
+
+common_extra_link_args = [
+    "-Wl,-rpath,$ORIGIN",
+    "-L/usr/local/cuda/lib64",
+    "-lcudart",
+]
+

 setup(
     name="shm_tensor_new_rdma",
@@ -9,19 +30,18 @@
             sources=["shm_tensor_new_rdma.cpp"],
             include_dirs=include_paths(),
             libraries=["rdmacm", "ibverbs", "torch", "torch_python", "c10"],
-            extra_compile_args=[
-                "-fPIC",
-                "-std=c++17",
-                "-D_GLIBCXX_USE_CXX11_ABI=0",
-                "-I/usr/local/cuda/include",
-            ],
-            extra_link_args=[
-                "-Wl,-rpath,$ORIGIN",
-                "-L/usr/local/cuda/lib64",
-                "-lcudart",
-            ],
-        )
+            extra_compile_args=common_extra_compile_args,
+            extra_link_args=common_extra_link_args,
+        ),
+        CppExtension(
+            name="shm_tensor_new_rdma_pre_alloc",
+            sources=["shm_tensor_new_rdma_pre_alloc.cpp"],
+            include_dirs=include_paths(),
+            libraries=["rdmacm", "ibverbs", "torch", "torch_python", "c10"],
+            extra_compile_args=common_extra_compile_args,
+            extra_link_args=common_extra_link_args,
+        ),
     ],
     cmdclass={"build_ext": BuildExtension},
     packages=[],
-)
\ No newline at end of file
+)

From b0f4ee0ab398109db0e2378f408914beddc20f80 Mon Sep 17 00:00:00 2001
From: Rita
Date: Thu, 20 Nov 2025 16:28:46 +0800
Subject: [PATCH 3/4] Refactor setup.py for shm_tensor_new_rdma

---
 .../shm_tensor_new_rdma_pre_alloc/setup.py | 46 +++++++++++++------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/megatron/shm_tensor_new_rdma_pre_alloc/setup.py b/megatron/shm_tensor_new_rdma_pre_alloc/setup.py
index d18b8c0b..d764992e 100644
--- a/megatron/shm_tensor_new_rdma_pre_alloc/setup.py
+++ b/megatron/shm_tensor_new_rdma_pre_alloc/setup.py
@@ -14,27 +14,47 @@

 from setuptools import setup
 from torch.utils.cpp_extension import CppExtension, BuildExtension, include_paths
+import torch
+
+abi_flag = getattr(torch._C, "_GLIBCXX_USE_CXX11_ABI", None)
+if abi_flag is None:
+    abi_flag = 1
+
+abi_macro = f"-D_GLIBCXX_USE_CXX11_ABI={int(abi_flag)}"
+
+common_extra_compile_args = [
+    "-fPIC",
+    "-std=c++17",
+    abi_macro,
+    "-I/usr/local/cuda/include",
+]
+
+common_extra_link_args = [
+    "-Wl,-rpath,$ORIGIN",
+    "-L/usr/local/cuda/lib64",
+    "-lcudart",
+]
+

 setup(
-    name="shm_tensor_new_rdma_pre_alloc",
+    name="shm_tensor_new_rdma",
     ext_modules=[
+        CppExtension(
+            name="shm_tensor_new_rdma",
+            sources=["shm_tensor_new_rdma.cpp"],
+            include_dirs=include_paths(),
+            libraries=["rdmacm", "ibverbs", "torch", "torch_python", "c10"],
+            extra_compile_args=common_extra_compile_args,
+            extra_link_args=common_extra_link_args,
+        ),
         CppExtension(
             name="shm_tensor_new_rdma_pre_alloc",
             sources=["shm_tensor_new_rdma_pre_alloc.cpp"],
             include_dirs=include_paths(),
             libraries=["rdmacm", "ibverbs", "torch", "torch_python", "c10"],
-            extra_compile_args=[
-                "-fPIC",
-                "-std=c++20",
-                "-D_GLIBCXX_USE_CXX11_ABI=0",
-                "-I/usr/local/cuda/include",
-            ],
-            extra_link_args=[
-                "-Wl,-rpath,$ORIGIN",
-                "-L/usr/local/cuda/lib64",
-                "-lcudart",
-            ],
-        )
+            extra_compile_args=common_extra_compile_args,
+            extra_link_args=common_extra_link_args,
+        ),
     ],
     cmdclass={"build_ext": BuildExtension},
     packages=[],

From b598fbbf547dda8909422e83a6f737d53c43ea94 Mon Sep 17 00:00:00 2001
From: Rita
Date: Thu, 20 Nov 2025 16:39:20 +0800
Subject: [PATCH 4/4] Add serialization_format support to _write_item

---
 .../dist_checkpointing/strategies/filesystem_async.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py
index 967856d9..97871b45 100644
--- a/megatron/core/dist_checkpointing/strategies/filesystem_async.py
+++ b/megatron/core/dist_checkpointing/strategies/filesystem_async.py
@@ -313,18 +313,23 @@ def write_preloaded_data(
         mem_before = _process_memory()

         local_results = []
+        extra_kwargs = {}
         try:
+            import inspect
+            if "serialization_format" in inspect.signature(_write_item).parameters:
+                from torch.distributed.checkpoint.filesystem import SerializationFormat
+                extra_kwargs["serialization_format"] = SerializationFormat.TORCH_SAVE
             file_name, storage_key, (bytes_data, tensor_data) = write_bucket
             with open(file_name, "wb") as stream:
                 for write_item, data in bytes_data:
                     local_results.append(
-                        _write_item(*transform_list, stream, data, write_item, storage_key)
+                        _write_item(*transform_list, stream, data, write_item, storage_key, **extra_kwargs)
                     )

                 for write_item, tensor in tensor_data:
                     assert tensor.is_cpu
                     local_results.append(
-                        _write_item(*transform_list, stream, tensor, write_item, storage_key)
+                        _write_item(*transform_list, stream, tensor, write_item, storage_key, **extra_kwargs)
                     )

             if use_fsync: