From 50e7d69adb25fba79b6dbcd42a6c6dba1fe21f02 Mon Sep 17 00:00:00 2001
From: luzhan
Date: Fri, 13 Sep 2024 11:22:15 +0800
Subject: [PATCH 1/3] prof: add llama3-8b test script

---
 examples/llama3-8b.sh | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 examples/llama3-8b.sh

diff --git a/examples/llama3-8b.sh b/examples/llama3-8b.sh
new file mode 100644
index 0000000..60f9d21
--- /dev/null
+++ b/examples/llama3-8b.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# [Note]: Commands in this script should be run under FlashModels Folder
+# bash ../scripts/launch-training-torchacc.sh
+
+
+# ========= seq_len=2048 mbs=1 python-fsdp=8 =========
+./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --max_seq_length 2048
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=8 =========
+XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama-3-8b-bs1-spmd-2409041135 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --spmd_fsdp --max_seq_length 8192
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=8 profile =========
+XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama-3-8b-bs2-best --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=95" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 1 --profile
+
+# ========= seq_len=8192 mbs=2 spmd-fsdp=8 =========
+XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=97" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 1 # OPTIMAL
+
+# ========= seq_len=8192 mbs=2 python-fsdp=8 =========
+./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --max_seq_length 8192 --gc --gc_cnt 9

From 6304cd715392bb79262aaa85c0f47d6e1f284eaa Mon Sep 17 00:00:00 2001
From: luzhan
Date: Fri, 13 Sep 2024 11:30:37 +0800
Subject: [PATCH 2/3] prof: add llama3-70b test script

---
 examples/llama3-70b.sh | 11 +++++++++++
 examples/llama3-8b.sh  |  5 -----
 2 files changed, 11 insertions(+), 5 deletions(-)
 create mode 100644 examples/llama3-70b.sh

diff --git a/examples/llama3-70b.sh b/examples/llama3-70b.sh
new file mode 100644
index 0000000..f893b19
--- /dev/null
+++ b/examples/llama3-70b.sh
@@ -0,0 +1,11 @@
+# ========= seq_len=8192 mbs=1 python-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-pythonfsdp-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-spmd-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=2 python-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs2-pythonfsdp-2409061648 --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
diff --git a/examples/llama3-8b.sh b/examples/llama3-8b.sh
index 60f9d21..df6425d 100644
--- a/examples/llama3-8b.sh
+++ b/examples/llama3-8b.sh
@@ -1,8 +1,3 @@
-#!/bin/bash
-# [Note]: Commands in this script should be run under FlashModels Folder
-# bash ../scripts/launch-training-torchacc.sh
-
-
 # ========= seq_len=2048 mbs=1 python-fsdp=8 =========
 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --max_seq_length 2048
 

From 920aa7a623fb69142ace2aed9312d7741a6b44dd Mon Sep 17 00:00:00 2001
From: luzhan
Date: Mon, 23 Sep 2024 10:55:19 +0800
Subject: [PATCH 3/3] prof: remove dump hlo flag and add llama3-70b spmd-fsdp mbs=2

---
 examples/llama3-70b.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/llama3-70b.sh b/examples/llama3-70b.sh
index f893b19..8b6de9c 100644
--- a/examples/llama3-70b.sh
+++ b/examples/llama3-70b.sh
@@ -1,11 +1,15 @@
 # ========= seq_len=8192 mbs=1 python-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-pythonfsdp-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
 
 # ========= seq_len=8192 mbs=1 spmd-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-spmd-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
 XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80
 
 # ========= seq_len=8192 mbs=2 python-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs2-pythonfsdp-2409061648 --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
-./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=2 spmd-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
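
Usage note: per the [Note] removed in PATCH 2/3, the scripts are meant to be run
from the FlashModels repository root. A minimal sketch of how the series might be
exercised after applying it; the checkout path, patch file names, and GPU counts
are assumptions (--fsdp 8 suggests an 8-GPU host for llama3-8b, --fsdp 32 a
32-GPU setup for llama3-70b):

    cd FlashModels                  # assumed checkout location
    git am 000*-prof-*.patch        # apply the three patches above
    bash ./examples/llama3-8b.sh    # runs the 8-GPU llama3-8b sweep
    bash ./examples/llama3-70b.sh   # assumes 32 GPUs are available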