From f1df5f8c2057f714730c1b27823bf8e1da07d637 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 2 Dec 2025 12:17:13 +0100 Subject: [PATCH 01/28] add deepseek constant --- mlperf_logging/mllog/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index 57972a6f..758b94d5 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -57,6 +57,7 @@ LLAMA31_405B = "llama31_405b" LLAMA31_8B = "llama31_8b" FLUX1 = "flux1" +DEEPSEEK_V3 = "deepseek_v3" # Constant values - model info ADAGRAD = "adagrad" From d2f27fef0500d45d7996f75f0ed4b213a350247c Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 2 Dec 2025 15:28:00 +0100 Subject: [PATCH 02/28] add deepseek to compliance check --- .../training_6.0.0/closed_bert.yaml | 48 ++++++++++++ .../training_6.0.0/closed_common.yaml | 2 +- .../training_6.0.0/closed_deepseek_v3.yaml | 74 +++++++++++++++++++ .../training_6.0.0/closed_retinanet.yaml | 35 +++++++++ .../training_6.0.0/closed_rgat.yaml | 21 ++++++ .../training_6.0.0/open_bert.yaml | 7 ++ .../training_6.0.0/open_common.yaml | 2 +- .../training_6.0.0/open_deepseek_v3.yaml | 65 ++++++++++++++++ .../training_6.0.0/open_retinanet.yaml | 7 ++ .../training_6.0.0/open_rgat.yaml | 7 ++ 10 files changed, 266 insertions(+), 2 deletions(-) create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml create mode 100644 
mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml new file mode 100644 index 00000000..408f669b --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml @@ -0,0 +1,48 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: num_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: start_warmup_step + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_weight_decay_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml index c17d1432..2b11ae6e 100755 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3'] " POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml 
b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml new file mode 100644 index 00000000..d71468fd --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml @@ -0,0 +1,74 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 6144 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 4.0) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml new file mode 100644 index 00000000..794ab7ab --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml @@ -0,0 +1,35 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_factor + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.340 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml new file mode 100644 index 00000000..2c1f7286 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml @@ -0,0 +1,21 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml index ab82d076..1f5c54a0 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml +++ 
b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3'] " POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml new file mode 100644 index 00000000..84512fea --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml @@ -0,0 +1,65 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 6144 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 4.0) and v['value'] > 0.0" + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml new file mode 100644 
index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" From 63db84d3eeb54e346ce5d4b152d2b865965fa573 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:17:44 +0100 Subject: [PATCH 03/28] rm closed_bert.yaml --- .../training_6.0.0/closed_bert.yaml | 48 ------------------- 1 file changed, 48 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml deleted file mode 100644 index 408f669b..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_bert.yaml +++ /dev/null @@ -1,48 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_lamb_epsilon - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_training_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: num_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: start_warmup_step - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_lamb_beta_1 - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_lamb_beta_2 - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_lamb_weight_decay_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: 
eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" From 1de3bbe58444ddd51f94f2c87ec45abb64791b7b Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:19:28 +0100 Subject: [PATCH 04/28] update deepseek values --- .../training_6.0.0/closed_deepseek_v3.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml index d71468fd..07059d7e 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml @@ -44,7 +44,7 @@ - KEY: NAME: opt_adamw_epsilon REQ: EXACTLY_ONE - CHECK: " v['value'] == 1e-05 " + CHECK: " v['value'] == 1e-08 " - KEY: NAME: opt_adamw_weight_decay @@ -64,11 +64,11 @@ - KEY: NAME: eval_samples REQ: EXACTLY_ONE - CHECK: " v['value'] == 6144 " + CHECK: " v['value'] == 1024 " - KEY: NAME: eval_accuracy REQ: AT_LEAST_ONE CHECK: - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 4.0) and v['value'] > 0.0" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value From 3e965eb079bc9bc5c63b8ccf6afb0375d356febf Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:19:46 +0100 Subject: [PATCH 05/28] rm closed_retinanet.yaml --- .../training_6.0.0/closed_retinanet.yaml | 35 ------------------- 1 file changed, 35 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml deleted file mode 100644 index 794ab7ab..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_retinanet.yaml 
+++ /dev/null @@ -1,35 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adam' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - CHECK: " v['value'] >= 0.0" - -- KEY: - NAME: opt_weight_decay - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.0" - -- KEY: - NAME: opt_learning_rate_warmup_epochs - REQ: EXACTLY_ONE - CHECK: " v['value'] >= 0.0" - -- KEY: - NAME: opt_learning_rate_warmup_factor - REQ: EXACTLY_ONE - CHECK: " v['value'] >= 0.0" - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] >= 0.340 and v['value'] < 1.0" From bcbef0ae621e9e0c644e3e7ca5a3ee171dd351a7 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:21:22 +0100 Subject: [PATCH 06/28] rm unused configs + update deepseek --- .../training_6.0.0/closed_rgat.yaml | 21 ------------------- .../training_6.0.0/open_bert.yaml | 7 ------- .../training_6.0.0/open_deepseek_v3.yaml | 4 ++-- .../training_6.0.0/open_retinanet.yaml | 7 ------- .../training_6.0.0/open_rgat.yaml | 7 ------- 5 files changed, 2 insertions(+), 44 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml deleted file mode 100644 index 2c1f7286..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_rgat.yaml +++ /dev/null @@ -1,21 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0" - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adam' " - -- KEY: 
- NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - CHECK: " v['value'] >= 0.0" - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml deleted file mode 100644 index 14c4176d..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_bert.yaml +++ /dev/null @@ -1,7 +0,0 @@ - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml index 84512fea..a9f73830 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml @@ -54,12 +54,12 @@ - KEY: NAME: eval_samples REQ: EXACTLY_ONE - CHECK: " v['value'] == 6144 " + CHECK: " v['value'] == 1024 " - KEY: NAME: eval_accuracy REQ: AT_LEAST_ONE CHECK: - - "'epoch_num' in v['metadata']" + - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 4.0) and v['value'] > 0.0" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml deleted file mode 100644 index 14c4176d..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_retinanet.yaml +++ /dev/null @@ -1,7 +0,0 @@ - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml
deleted file mode 100644 index 14c4176d..00000000 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_rgat.yaml +++ /dev/null @@ -1,7 +0,0 @@ - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] < 1.0" From 3eca23bed1b0516738087acba620edcb597d6f2e Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:25:58 +0100 Subject: [PATCH 07/28] fix deepseek name --- .../compliance_checker/training_6.0.0/closed_common.yaml | 2 +- .../{closed_deepseek_v3.yaml => closed_deepseek_v3_671b.yaml} | 0 .../compliance_checker/training_6.0.0/open_common.yaml | 2 +- .../{open_deepseek_v3.yaml => open_deepseek_v3_671b.yaml} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename mlperf_logging/compliance_checker/training_6.0.0/{closed_deepseek_v3.yaml => closed_deepseek_v3_671b.yaml} (100%) rename mlperf_logging/compliance_checker/training_6.0.0/{open_deepseek_v3.yaml => open_deepseek_v3_671b.yaml} (100%) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml index 2b11ae6e..7b5c4b12 100755 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] " POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3.yaml rename to 
mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml index 1f5c54a0..8b0b43a6 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] " POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3.yaml rename to mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml From ec1d0aad669fa6ff6c722275b3fd35f62bedc50e Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 14:27:11 +0100 Subject: [PATCH 08/28] fix deepseek name --- mlperf_logging/mllog/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index 758b94d5..a59a1ae2 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -57,7 +57,7 @@ LLAMA31_405B = "llama31_405b" LLAMA31_8B = "llama31_8b" FLUX1 = "flux1" -DEEPSEEK_V3 = "deepseek_v3" +DEEPSEEK_V3 = "deepseek_v3_671b" # Constant values - model info ADAGRAD = "adagrad" From cded0c88719e26fb288a749a97e9bc7bf8088333 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Fri, 6 Feb 2026 15:01:32 +0100 Subject: [PATCH 09/28] +DEEPSEEK_V3 -> +DEEPSEEK_V3_671B --- 
mlperf_logging/mllog/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index a59a1ae2..3016f0a3 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -57,7 +57,7 @@ LLAMA31_405B = "llama31_405b" LLAMA31_8B = "llama31_8b" FLUX1 = "flux1" -DEEPSEEK_V3 = "deepseek_v3_671b" +DEEPSEEK_V3_671B = "deepseek_v3_671b" # Constant values - model info ADAGRAD = "adagrad" From c528ecb8b11e226594d7c977912a9be6a99128ee Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 24 Feb 2026 11:35:19 +0100 Subject: [PATCH 10/28] add deepseek_v3_671b RCP support for training 6.0 - Add rcps_deepseek_v3_671b.json stub with BS 16384/18432/20480, learning rates, warmup steps, and gradient accumulation steps - Register deepseek_v3_671b in benchmark_meta.py (result file counts and allowed benchmarks for 6.0) - Add deepseek_v3_671b to submission_runs and eval_accuracy parsing in rcp_checker.py - Add deepseek_v3_671b entry to result_summarizer config.yaml --- mlperf_logging/benchmark_meta.py | 8 ++- mlperf_logging/rcp_checker/rcp_checker.py | 3 +- .../training_6.0.0/rcps_deepseek_v3_671b.json | 55 +++++++++++++++++++ mlperf_logging/result_summarizer/config.yaml | 3 +- 4 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 593c91d7..379f1267 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -23,6 +23,7 @@ # TODO: Update with official values 'llama31_8b': 10, 'flux1': 10, + 'deepseek_v3_671b': 10, }, 'hpc' : { @@ -158,11 +159,12 @@ ], '6.0': [ 'llama31_8b', - 'dlrm_dcnv2', + 'dlrm_dcnv2', 'flux1', 'llama2_70b_lora', - 'llama31_405b' - ] + 'llama31_405b', + 'deepseek_v3_671b' + ] }, 'hpc': { diff --git a/mlperf_logging/rcp_checker/rcp_checker.py 
b/mlperf_logging/rcp_checker/rcp_checker.py index cdb383f7..29ffc6aa 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,6 +60,7 @@ def is_version_at_least(version, min_version): 'flux1': 10, 'llama31_405b': 3, 'llama31_8b': 10, + 'deepseek_v3_671b': 10, }, "hpc": { 'cosmoflow': 10, @@ -109,7 +110,7 @@ def read_submission_file(result_file, ruleset, use_train_samples): eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"] eval_score = json.loads(eval_accuracy_str)["value"] stable_diffusion_eval_results[eval_step][eval_metric] = eval_score - elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b"} and ("eval_error" in str or "eval_accuracy" in str): + elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "deepseek_v3_671b"} and ("eval_error" in str or "eval_accuracy" in str): eval_accuracy_str = str conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"] eval_score = json.loads(eval_accuracy_str)["value"] diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json new file mode 100644 index 00000000..d56a3971 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json @@ -0,0 +1,55 @@ +{ + "deepseek_v3_671b_ref_16384": + { + "Benchmark": "deepseek_v3_671b", + "Creator": "MLPerf", + "When": "Reference RCPs before 6.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.000024, + "opt_learning_rate_warmup_steps": 4, + "gradient_accumulation_steps": 256 + }, + "Epochs to converge": [ + // TODO(dfridman) + ] + }, + + "deepseek_v3_671b_ref_18432": + { + "Benchmark": "deepseek_v3_671b", + "Creator": "MLPerf", + "When": "Reference RCPs before 6.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 18432, + "Hyperparams": { + 
"opt_base_learning_rate": 0.000025456, + "opt_learning_rate_warmup_steps": 4, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + // TODO(dfridman) + ] + }, + + "deepseek_v3_671b_ref_20480": + { + "Benchmark": "deepseek_v3_671b", + "Creator": "MLPerf", + "When": "Reference RCPs before 6.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 20480, + "Hyperparams": { + "opt_base_learning_rate": 0.000026833, + "opt_learning_rate_warmup_steps": 4, + "gradient_accumulation_steps": 320 + }, + "Epochs to converge": [ + // TODO(dfridman) + ] + } + } diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 111921f0..2409a748 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -103,11 +103,12 @@ columns: llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] default: [" ", " ", " "] "6.0.0": - dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] + deepseek_v3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeek-V3-671B"] default: [" ", " ", " "] hpc: From adff4f619e1003b03301c81b51590a52aabf4177 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 24 Feb 2026 13:55:16 +0100 Subject: [PATCH 11/28] update deepseek rcps: set Creator to NVIDIA and Platform to GB300 --- .../training_6.0.0/rcps_deepseek_v3_671b.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json index d56a3971..f42f2b67 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json @@ -2,9 +2,9 @@ "deepseek_v3_671b_ref_16384": { "Benchmark": "deepseek_v3_671b", - "Creator": "MLPerf", + "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "TBD", + "Platform": "GB300", "Precision": "BF16", "BS": 16384, "Hyperparams": { @@ -20,9 +20,9 @@ "deepseek_v3_671b_ref_18432": { "Benchmark": "deepseek_v3_671b", - "Creator": "MLPerf", + "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "TBD", + "Platform": "GB300", "Precision": "BF16", "BS": 18432, "Hyperparams": { @@ -38,9 +38,9 @@ "deepseek_v3_671b_ref_20480": { "Benchmark": "deepseek_v3_671b", - "Creator": "MLPerf", + "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "TBD", + "Platform": "GB300", "Precision": "BF16", "BS": 20480, "Hyperparams": { From 749650a2a56ba68af5e92bef728f40e5b14dfd2c Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 10:19:07 +0100 Subject: [PATCH 12/28] rename deepseek_v3_671b -> deepseekv3_671b across files and filenames --- mlperf_logging/benchmark_meta.py | 4 ++-- .../training_6.0.0/closed_common.yaml | 2 +- ...seek_v3_671b.yaml => closed_deepseekv3_671b.yaml} | 0 .../training_6.0.0/open_common.yaml | 2 +- ...epseek_v3_671b.yaml => open_deepseekv3_671b.yaml} | 0 mlperf_logging/mllog/constants.py | 2 +- mlperf_logging/rcp_checker/rcp_checker.py | 4 ++-- ...epseek_v3_671b.json => rcps_deepseekv3_671b.json} | 12 ++++++------ mlperf_logging/result_summarizer/config.yaml | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) rename mlperf_logging/compliance_checker/training_6.0.0/{closed_deepseek_v3_671b.yaml => closed_deepseekv3_671b.yaml} (100%) 
rename mlperf_logging/compliance_checker/training_6.0.0/{open_deepseek_v3_671b.yaml => open_deepseekv3_671b.yaml} (100%) rename mlperf_logging/rcp_checker/training_6.0.0/{rcps_deepseek_v3_671b.json => rcps_deepseekv3_671b.json} (85%) diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 379f1267..22f5c2fa 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -23,7 +23,7 @@ # TODO: Update with official values 'llama31_8b': 10, 'flux1': 10, - 'deepseek_v3_671b': 10, + 'deepseekv3_671b': 10, }, 'hpc' : { @@ -163,7 +163,7 @@ 'flux1', 'llama2_70b_lora', 'llama31_405b', - 'deepseek_v3_671b' + 'deepseekv3_671b' ] }, diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml index 7b5c4b12..508e12cb 100755 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseekv3_671b'] " POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml rename to mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml index 8b0b43a6..7a51a15a 100644 --- 
a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseekv3_671b'] " POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml rename to mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index 3016f0a3..a7c1bd77 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -57,7 +57,7 @@ LLAMA31_405B = "llama31_405b" LLAMA31_8B = "llama31_8b" FLUX1 = "flux1" -DEEPSEEK_V3_671B = "deepseek_v3_671b" +DEEPSEEK_V3_671B = "deepseekv3_671b" # Constant values - model info ADAGRAD = "adagrad" diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 29ffc6aa..0846cbbb 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,7 +60,7 @@ def is_version_at_least(version, min_version): 'flux1': 10, 'llama31_405b': 3, 'llama31_8b': 10, - 'deepseek_v3_671b': 10, + 'deepseekv3_671b': 10, }, "hpc": { 'cosmoflow': 10, @@ -110,7 +110,7 @@ def read_submission_file(result_file, ruleset, use_train_samples): eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"] eval_score = json.loads(eval_accuracy_str)["value"] stable_diffusion_eval_results[eval_step][eval_metric] = eval_score - elif 
benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "deepseek_v3_671b"} and ("eval_error" in str or "eval_accuracy" in str): + elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "deepseekv3_671b"} and ("eval_error" in str or "eval_accuracy" in str): eval_accuracy_str = str conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"] eval_score = json.loads(eval_accuracy_str)["value"] diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json similarity index 85% rename from mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json rename to mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index f42f2b67..3a151cf4 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseek_v3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -1,7 +1,7 @@ { - "deepseek_v3_671b_ref_16384": + "deepseekv3_671b_ref_16384": { - "Benchmark": "deepseek_v3_671b", + "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", "Platform": "GB300", @@ -17,9 +17,9 @@ ] }, - "deepseek_v3_671b_ref_18432": + "deepseekv3_671b_ref_18432": { - "Benchmark": "deepseek_v3_671b", + "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", "Platform": "GB300", @@ -35,9 +35,9 @@ ] }, - "deepseek_v3_671b_ref_20480": + "deepseekv3_671b_ref_20480": { - "Benchmark": "deepseek_v3_671b", + "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", "Platform": "GB300", diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 2409a748..24264f7d 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -108,7 +108,7 @@ columns: llama2_70b_lora: ["Benchmark results 
(minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] - deepseek_v3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeek-V3-671B"] + deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeek-V3-671B"] default: [" ", " ", " "] hpc: From e8300ba72a395d98ae8841253581ad725b75e53a Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 13:27:18 +0100 Subject: [PATCH 13/28] add opt_base_learning_rate check for deepseekv3_671b (sqrt scaling from BS 16384) --- .../training_6.0.0/closed_deepseekv3_671b.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index 07059d7e..00b60f3e 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -17,6 +17,7 @@ - KEY: NAME: opt_base_learning_rate REQ: EXACTLY_ONE + CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " - KEY: NAME: opt_learning_rate_warmup_steps From 1dc9e8d582e8ab5a03dc49df4de46f83f43c9d9f Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 13:36:30 +0100 Subject: [PATCH 14/28] add max_steps and decay_steps checks for deepseekv3_671b --- .../training_6.0.0/closed_deepseekv3_671b.yaml | 11 +++++++++++ .../training_6.0.0/rcps_deepseekv3_671b.json | 3 +++ 2 files changed, 14 insertions(+) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index 00b60f3e..e1dce257 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ 
b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -19,13 +19,24 @@ REQ: EXACTLY_ONE CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 12000 " + POST: > + s['max_steps'] = v['value'] + - KEY: NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == 4 " + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index 3a151cf4..c93337c2 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -10,6 +10,7 @@ "Hyperparams": { "opt_base_learning_rate": 0.000024, "opt_learning_rate_warmup_steps": 4, + "max_steps": 12000, "gradient_accumulation_steps": 256 }, "Epochs to converge": [ @@ -28,6 +29,7 @@ "Hyperparams": { "opt_base_learning_rate": 0.000025456, "opt_learning_rate_warmup_steps": 4, + "max_steps": 12000, "gradient_accumulation_steps": 288 }, "Epochs to converge": [ @@ -46,6 +48,7 @@ "Hyperparams": { "opt_base_learning_rate": 0.000026833, "opt_learning_rate_warmup_steps": 4, + "max_steps": 12000, "gradient_accumulation_steps": 320 }, "Epochs to converge": [ From d09e52f33189ff33535f87d9c9ff3c43b1802aab Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 13:39:45 +0100 Subject: [PATCH 15/28] apply same checks to open_deepseekv3_671b.yaml --- .../training_6.0.0/open_deepseekv3_671b.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml 
b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index a9f73830..ad18c92e 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -14,13 +14,29 @@ REQ: EXACTLY_ONE CHECK: " v['value'] == 'adamw' " +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 12000 " + POST: > + s['max_steps'] = v['value'] + - KEY: NAME: opt_base_learning_rate REQ: EXACTLY_ONE + CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " - KEY: NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == 4 " + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule From 1941059db865f12f9e596ba0d922e8c7e5825d09 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 13:40:54 +0100 Subject: [PATCH 16/28] update deepseekv3_671b target loss to 4.05 --- .../training_6.0.0/closed_deepseekv3_671b.yaml | 2 +- .../compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index e1dce257..021d2d35 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -83,4 +83,4 @@ REQ: AT_LEAST_ONE CHECK: - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value + ATLEAST_ONE_CHECK: "(v['value'] <= 4.05) and v['value'] > 0.0" diff --git 
a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index ad18c92e..9c5cada2 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -77,5 +77,5 @@ REQ: AT_LEAST_ONE CHECK: - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value + ATLEAST_ONE_CHECK: "(v['value'] <= 4.05) and v['value'] > 0.0" From 9ddf73ac950489500900f30bebba18211c97a1ff Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Wed, 25 Feb 2026 16:24:00 +0100 Subject: [PATCH 17/28] set deepseekv3_671b submission runs to 5 --- mlperf_logging/benchmark_meta.py | 2 +- mlperf_logging/rcp_checker/rcp_checker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 22f5c2fa..5143ac4d 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -23,7 +23,7 @@ # TODO: Update with official values 'llama31_8b': 10, 'flux1': 10, - 'deepseekv3_671b': 10, + 'deepseekv3_671b': 5, }, 'hpc' : { diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 0846cbbb..2f5b0780 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,7 +60,7 @@ def is_version_at_least(version, min_version): 'flux1': 10, 'llama31_405b': 3, 'llama31_8b': 10, - 'deepseekv3_671b': 10, + 'deepseekv3_671b': 5, }, "hpc": { 'cosmoflow': 10, From 3bb7acfc02cde6d530f1242f4b3f3b99939f87e4 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 16:28:45 +0100 Subject: [PATCH 18/28] set deepseekv3_671b submission runs to 3 --- mlperf_logging/benchmark_meta.py | 2 +- mlperf_logging/rcp_checker/rcp_checker.py | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 5143ac4d..a3ad19f5 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -23,7 +23,7 @@ # TODO: Update with official values 'llama31_8b': 10, 'flux1': 10, - 'deepseekv3_671b': 5, + 'deepseekv3_671b': 3, }, 'hpc' : { diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 2f5b0780..8838ffba 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,7 +60,7 @@ def is_version_at_least(version, min_version): 'flux1': 10, 'llama31_405b': 3, 'llama31_8b': 10, - 'deepseekv3_671b': 5, + 'deepseekv3_671b': 3, }, "hpc": { 'cosmoflow': 10, From c3204bcf95089eb2ddea2eff018cb666465cac0c Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 16:29:19 +0100 Subject: [PATCH 19/28] update deepseekv3_671b target loss to 3.6 --- .../training_6.0.0/closed_deepseekv3_671b.yaml | 2 +- .../compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index 021d2d35..66877eb6 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -83,4 +83,4 @@ REQ: AT_LEAST_ONE CHECK: - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 4.05) and v['value'] > 0.0" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index 9c5cada2..6bf288ea 100644 --- 
a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -77,5 +77,5 @@ REQ: AT_LEAST_ONE CHECK: - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 4.05) and v['value'] > 0.0" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" From b9726047384e782005c14fad373ce5b455351824 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 16:45:28 +0100 Subject: [PATCH 20/28] update deepseekv3_671b platform to GB300 NVL72 --- .../rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index c93337c2..5715d261 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -4,7 +4,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "GB300", + "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", "BS": 16384, "Hyperparams": { @@ -23,7 +23,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "GB300", + "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", "BS": 18432, "Hyperparams": { @@ -42,7 +42,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "GB300", + "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", "BS": 20480, "Hyperparams": { From e50cf5ee80aae70fa534349fbfe80eef40153fbc Mon 
Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 16:50:52 +0100 Subject: [PATCH 21/28] update deepseekv3_671b RCPs: use BS 15360/16384/18432 --- .../training_6.0.0/rcps_deepseekv3_671b.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index 5715d261..a4deeef2 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -1,55 +1,55 @@ { - "deepseekv3_671b_ref_16384": + "deepseekv3_671b_ref_15360": { "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", - "BS": 16384, + "BS": 15360, "Hyperparams": { - "opt_base_learning_rate": 0.000024, + "opt_base_learning_rate": 0.000023238, "opt_learning_rate_warmup_steps": 4, "max_steps": 12000, - "gradient_accumulation_steps": 256 + "gradient_accumulation_steps": 240 }, "Epochs to converge": [ // TODO(dfridman) ] }, - "deepseekv3_671b_ref_18432": + "deepseekv3_671b_ref_16384": { "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", - "BS": 18432, + "BS": 16384, "Hyperparams": { - "opt_base_learning_rate": 0.000025456, + "opt_base_learning_rate": 0.000024, "opt_learning_rate_warmup_steps": 4, "max_steps": 12000, - "gradient_accumulation_steps": 288 + "gradient_accumulation_steps": 256 }, "Epochs to converge": [ // TODO(dfridman) ] }, - "deepseekv3_671b_ref_20480": + "deepseekv3_671b_ref_18432": { "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs 
before 6.0 submission", "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", "Precision": "BF16", - "BS": 20480, + "BS": 18432, "Hyperparams": { - "opt_base_learning_rate": 0.000026833, + "opt_base_learning_rate": 0.000025456, "opt_learning_rate_warmup_steps": 4, "max_steps": 12000, - "gradient_accumulation_steps": 320 + "gradient_accumulation_steps": 288 }, "Epochs to converge": [ // TODO(dfridman) From 29f9b8bcbdb52030b367d60daf8ab5fc59dc0339 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 16:59:01 +0100 Subject: [PATCH 22/28] add global_batch_size >= 15360 check for deepseekv3_671b --- .../training_6.0.0/closed_deepseekv3_671b.yaml | 1 + .../training_6.0.0/open_deepseekv3_671b.yaml | 9 +-------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index 66877eb6..915d4fb3 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -1,6 +1,7 @@ - KEY: NAME: global_batch_size REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " POST: > s['global_batch_size'] = v['value'] diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index 6bf288ea..d6de5fa5 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -1,6 +1,7 @@ - KEY: NAME: global_batch_size REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " POST: > s['global_batch_size'] = v['value'] @@ -17,26 +18,18 @@ - KEY: NAME: max_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == 12000 " - POST: > - s['max_steps'] = v['value'] - KEY: NAME: 
opt_base_learning_rate REQ: EXACTLY_ONE - CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " - KEY: NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == 4 " - POST: > - s['opt_learning_rate_warmup_steps'] = v['value'] - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule From d11d808fdbfc2130fc89975c08f6fd40abbfc285 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 17:00:13 +0100 Subject: [PATCH 23/28] restore checks and POSTs in open_deepseekv3_671b.yaml --- .../training_6.0.0/open_deepseekv3_671b.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index d6de5fa5..3144e66d 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -18,18 +18,26 @@ - KEY: NAME: max_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == 12000 " + POST: > + s['max_steps'] = v['value'] - KEY: NAME: opt_base_learning_rate REQ: EXACTLY_ONE + CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " - KEY: NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == 4 " + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE + CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule From 48ca1866143731ac56c1e6999de10b2a226451ae Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 17:01:41 +0100 Subject: [PATCH 24/28] rename DEEPSEEK_V3_671B -> DEEPSEEKV3_671B in constants.py --- mlperf_logging/mllog/constants.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index a7c1bd77..002a2a1d 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -57,7 +57,7 @@ LLAMA31_405B = "llama31_405b" LLAMA31_8B = "llama31_8b" FLUX1 = "flux1" -DEEPSEEK_V3_671B = "deepseekv3_671b" +DEEPSEEKV3_671B = "deepseekv3_671b" # Constant values - model info ADAGRAD = "adagrad" From 462b442d3292105792e4c24c71d2d529ab57fb22 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 17:04:02 +0100 Subject: [PATCH 25/28] remove checks from open_deepseekv3_671b.yaml --- .../training_6.0.0/open_deepseekv3_671b.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml index 3144e66d..d6de5fa5 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml @@ -18,26 +18,18 @@ - KEY: NAME: max_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == 12000 " - POST: > - s['max_steps'] = v['value'] - KEY: NAME: opt_base_learning_rate REQ: EXACTLY_ONE - CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " - KEY: NAME: opt_learning_rate_warmup_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == 4 " - POST: > - s['opt_learning_rate_warmup_steps'] = v['value'] - KEY: NAME: opt_learning_rate_decay_steps REQ: EXACTLY_ONE - CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] " - KEY: NAME: opt_learning_rate_decay_schedule From 5ad02fbbf3b6a7253661ffd4204ce13c53a786d3 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Thu, 26 Feb 2026 17:05:40 +0100 Subject: [PATCH 26/28] rename DeepSeek-V3-671B -> DeepSeekV3-671B in result_summarizer config --- mlperf_logging/result_summarizer/config.yaml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 24264f7d..55d00bf9 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -108,7 +108,7 @@ columns: llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] - deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeek-V3-671B"] + deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] default: [" ", " ", " "] hpc: From 46bcffb13b44e193aaa3c294fa9e3de884c3d135 Mon Sep 17 00:00:00 2001 From: Shriya Rishab Date: Thu, 26 Feb 2026 11:11:31 -0500 Subject: [PATCH 27/28] Update platform value --- .../rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index a4deeef2..93ec4a83 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -4,7 +4,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", + "Platform": "64 NVIDIA GB300 NVL72", "Precision": "BF16", "BS": 15360, "Hyperparams": { @@ -23,7 +23,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", + "Platform": "64 NVIDIA GB300 NVL72", 
"Precision": "BF16", "BS": 16384, "Hyperparams": { @@ -42,7 +42,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "NVIDIA GB300 NVL72: 72x NVIDIA GB300 288GB GPUs, 36x Grace CPUs, 17TB System Memory, NVIDIA ConnectX-7", + "Platform": "64 NVIDIA GB300 NVL72", "Precision": "BF16", "BS": 18432, "Hyperparams": { From a46ecdb0e8543f15e2fdf33bac3fa0c30290d0fa Mon Sep 17 00:00:00 2001 From: Shriya Rishab Date: Thu, 26 Feb 2026 16:42:24 -0500 Subject: [PATCH 28/28] Update platform description in JSON file --- .../rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json index 93ec4a83..379288b0 100644 --- a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json @@ -4,7 +4,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "64 NVIDIA GB300 NVL72", + "Platform": "64 NVIDIA GB300 nodes", "Precision": "BF16", "BS": 15360, "Hyperparams": { @@ -23,7 +23,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "64 NVIDIA GB300 NVL72", + "Platform": "64 NVIDIA GB300 nodes", "Precision": "BF16", "BS": 16384, "Hyperparams": { @@ -42,7 +42,7 @@ "Benchmark": "deepseekv3_671b", "Creator": "NVIDIA", "When": "Reference RCPs before 6.0 submission", - "Platform": "64 NVIDIA GB300 NVL72", + "Platform": "64 NVIDIA GB300 nodes", "Precision": "BF16", "BS": 18432, "Hyperparams": {