From 50e7d69adb25fba79b6dbcd42a6c6dba1fe21f02 Mon Sep 17 00:00:00 2001
From: luzhan
Date: Fri, 13 Sep 2024 11:22:15 +0800
Subject: [PATCH 1/3] prof: add llama3-8b test script

---
 examples/llama3-8b.sh | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 examples/llama3-8b.sh

diff --git a/examples/llama3-8b.sh b/examples/llama3-8b.sh
new file mode 100644
index 0000000..60f9d21
--- /dev/null
+++ b/examples/llama3-8b.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# [Note]: Commands in this script should be run under FlashModels Folder
+# bash ../scripts/launch-training-torchacc.sh
+
+
+# ========= seq_len=2048 mbs=1 python-fsdp=8 =========
+./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --max_seq_length 2048
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=8 =========
+XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama-3-8b-bs1-spmd-2409041135 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --spmd_fsdp --max_seq_length 8192
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=8 profile =========
+XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama-3-8b-bs2-best --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=95" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 1 --profile
+
+# ========= seq_len=8192 mbs=2 spmd-fsdp=8 =========
+XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=97" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 1 # OPTIMAL
+
+# ========= seq_len=8192 mbs=2 python-fsdp=8 =========
+./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 2 --fsdp 8 --max_seq_length 8192 --gc --gc_cnt 9

From 6304cd715392bb79262aaa85c0f47d6e1f284eaa Mon Sep 17 00:00:00 2001
From: luzhan
Date: Fri, 13 Sep 2024 11:30:37 +0800
Subject: [PATCH 2/3] prof: add llama3-70b test script

---
 examples/llama3-70b.sh | 11 +++++++++++
 examples/llama3-8b.sh  |  5 -----
 2 files changed, 11 insertions(+), 5 deletions(-)
 create mode 100644 examples/llama3-70b.sh

diff --git a/examples/llama3-70b.sh b/examples/llama3-70b.sh
new file mode 100644
index 0000000..f893b19
--- /dev/null
+++ b/examples/llama3-70b.sh
@@ -0,0 +1,11 @@
+# ========= seq_len=8192 mbs=1 python-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-pythonfsdp-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=1 spmd-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-spmd-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=2 python-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs2-pythonfsdp-2409061648 --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
diff --git a/examples/llama3-8b.sh b/examples/llama3-8b.sh
index 60f9d21..df6425d 100644
--- a/examples/llama3-8b.sh
+++ b/examples/llama3-8b.sh
@@ -1,8 +1,3 @@
-#!/bin/bash
-# [Note]: Commands in this script should be run under FlashModels Folder
-# bash ../scripts/launch-training-torchacc.sh
-
-
 # ========= seq_len=2048 mbs=1 python-fsdp=8 =========
 ./examples/run.sh --model ./hf_models/config/llama-3-8b --accelerator acc --mbs 1 --fsdp 8 --max_seq_length 2048
 

From 920aa7a623fb69142ace2aed9312d7741a6b44dd Mon Sep 17 00:00:00 2001
From: luzhan
Date: Mon, 23 Sep 2024 10:55:19 +0800
Subject: [PATCH 3/3] prof: remove dump hlo flag and add llama3-70b spmd-fsdp mbs=2

---
 examples/llama3-70b.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/llama3-70b.sh b/examples/llama3-70b.sh
index f893b19..8b6de9c 100644
--- a/examples/llama3-70b.sh
+++ b/examples/llama3-70b.sh
@@ -1,11 +1,15 @@
 # ========= seq_len=8192 mbs=1 python-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-pythonfsdp-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
 
 # ========= seq_len=8192 mbs=1 spmd-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs1-spmd-2409041559 --xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
 XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 1 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80
 
 # ========= seq_len=8192 mbs=2 python-fsdp=32 =========
-LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=./hlo/llama3-70b-bs2-pythonfsdp-2409061648 --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
-./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.90 XLA_FLAGS="--xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=8589934592" \
+./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --max_seq_length 8192 --gc --gc_cnt 80
+
+# ========= seq_len=8192 mbs=2 spmd-fsdp=32 =========
+LOW_CPU_MEM_USAGE=1 PJRT_ALLOCATOR_FRACTION=0.95 XLA_FLAGS="--xla_disable_hlo_passes=gpu-convert-async-collectives-to-sync,triton-autotuner --xla_gpu_memory_limit_slop_factor=100 --xla_multiheap_size_constraint_per_heap=4294967296" \
+XLA_USE_SPMD=1 ./examples/run.sh --model ./hf_models/config/llama-3-70b --accelerator acc --mbs 2 --fsdp 32 --spmd_fsdp --max_seq_length 8192 --gc --gc_cnt 80 # OPTIMAL
\ No newline at end of file
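
Usage note: per the [Note] removed in PATCH 2/3, the scripts are meant to be run
from the FlashModels repository root. A minimal sketch of how the series might be
exercised after applying it; the checkout path, patch file names, and GPU counts
are assumptions (--fsdp 8 suggests an 8-GPU host for llama3-8b, --fsdp 32 a
32-GPU setup for llama3-70b):

    cd FlashModels                  # assumed checkout location
    git am 000*-prof-*.patch        # apply the three patches above
    bash ./examples/llama3-8b.sh    # runs the 8-GPU llama3-8b sweep
    bash ./examples/llama3-70b.sh   # assumes 32 GPUs are available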