diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
index 593c91d7..a3ad19f5 100644
--- a/mlperf_logging/benchmark_meta.py
+++ b/mlperf_logging/benchmark_meta.py
@@ -23,6 +23,7 @@
         # TODO: Update with official values
         'llama31_8b': 10,
         'flux1': 10,
+        'deepseekv3_671b': 3,
     },

     'hpc' : {
@@ -158,11 +159,12 @@
         ],
         '6.0': [
             'llama31_8b',
-            'dlrm_dcnv2',
+            'dlrm_dcnv2',
             'flux1',
             'llama2_70b_lora',
-            'llama31_405b'
-        ]
+            'llama31_405b',
+            'deepseekv3_671b'
+        ]
     },

     'hpc': {
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
index c17d1432..508e12cb 100755
--- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
+++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
@@ -2,7 +2,7 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
+    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseekv3_671b'] "
     POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "

 - KEY:
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml
new file mode 100644
index 00000000..915d4fb3
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml
@@ -0,0 +1,87 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 15360 "
+    POST: >
+      s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 4096 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+    CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 "
+
+- KEY:
+    NAME: max_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 12000 "
+    POST: >
+      s['max_steps'] = v['value']
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 4 "
+    POST: >
+      s['opt_learning_rate_warmup_steps'] = v['value']
+
+- KEY:
+    NAME: opt_learning_rate_decay_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == s['max_steps'] - s['opt_learning_rate_warmup_steps'] "
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'cosine with linear warmup' "
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.9 "
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.95 "
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1e-08 "
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.1 "
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1.0 "
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1024 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'samples_count' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0"
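Reviewer note (not part of the patch): the CHECK/POST strings in the new closed_deepseekv3_671b.yaml are Python expressions that the compliance checker evaluates with `v` bound to the parsed log record and `s` to a state dict shared across keys, which is how the `opt_base_learning_rate` rule can reference `s['global_batch_size']`. Below is a minimal sketch of that evaluation order; the `run_key` helper is hypothetical and the real checker's evaluation context is richer.

```python
import math

s = {}  # shared state, populated by POST snippets

def run_key(v, check, post=None):
    # Hypothetical stand-in for the checker's eval of CHECK/POST strings.
    ok = eval(check, {"math": math}, {"v": v, "s": s})
    if post:
        exec(post, {}, {"v": v, "s": s})
    return ok

# global_batch_size is validated first; its POST stashes the value in s.
assert run_key({"value": 16384},
               "v['value'] >= 15360",
               "s['global_batch_size'] = v['value']")

# opt_base_learning_rate must then satisfy sqrt scaling around the
# 2.4e-05 @ 16384 anchor, to within the yaml's 1e-9 tolerance.
assert run_key({"value": 2.4e-05},
               "abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9")
```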
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
index ab82d076..7a51a15a 100644
--- a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
+++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
@@ -2,5 +2,5 @@
 - KEY:
     NAME: submission_benchmark
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
+    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseekv3_671b'] "
     POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) "
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml
new file mode 100644
index 00000000..d6de5fa5
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseekv3_671b.yaml
@@ -0,0 +1,74 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] >= 15360 "
+    POST: >
+      s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 4096 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: max_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1024 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'samples_count' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0"
+
diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
index 57972a6f..002a2a1d 100644
--- a/mlperf_logging/mllog/constants.py
+++ b/mlperf_logging/mllog/constants.py
@@ -57,6 +57,7 @@
 LLAMA31_405B = "llama31_405b"
 LLAMA31_8B = "llama31_8b"
 FLUX1 = "flux1"
+DEEPSEEKV3_671B = "deepseekv3_671b"

 # Constant values - model info
 ADAGRAD = "adagrad"
diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
index cdb383f7..8838ffba 100644
--- a/mlperf_logging/rcp_checker/rcp_checker.py
+++ b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -60,6 +60,7 @@ def is_version_at_least(version, min_version):
         'flux1': 10,
         'llama31_405b': 3,
         'llama31_8b': 10,
+        'deepseekv3_671b': 3,
     },
     "hpc": {
         'cosmoflow': 10,
@@ -109,7 +110,7 @@ def read_submission_file(result_file, ruleset, use_train_samples):
                 eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"]
                 eval_score = json.loads(eval_accuracy_str)["value"]
                 stable_diffusion_eval_results[eval_step][eval_metric] = eval_score
-            elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b"} and ("eval_error" in str or "eval_accuracy" in str):
+            elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "deepseekv3_671b"} and ("eval_error" in str or "eval_accuracy" in str):
                 eval_accuracy_str = str
                 conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"]
                 eval_score = json.loads(eval_accuracy_str)["value"]
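Reviewer note (not part of the patch): the open yaml's eval_accuracy gate has been aligned to `samples_count` here; the draft checked `'epoch_num' in v['metadata']`, but the closed yaml and the rcp_checker branch above both key off `metadata["samples_count"]`, so `epoch_num` looks like a copy-paste slip from an older template. A hedged sketch of the eval-line handling that branch enables for deepseekv3_671b, with made-up numbers:

```python
import json

# Hypothetical mllog eval record; the loss and samples_count values are invented.
line = ('{"key": "eval_accuracy", "value": 3.42, '
        '"metadata": {"samples_count": 4915200}}')

payload = json.loads(line)
conv_samples = payload["metadata"]["samples_count"]  # what rcp_checker reads
eval_score = payload["value"]

# Same gate as ATLEAST_ONE_CHECK in the new yamls: 0.0 < eval loss <= 3.6.
print(conv_samples, eval_score, 0.0 < eval_score <= 3.6)
```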
diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json
new file mode 100644
index 00000000..379288b0
--- /dev/null
+++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_deepseekv3_671b.json
@@ -0,0 +1,58 @@
+{
+    "deepseekv3_671b_ref_15360":
+    {
+        "Benchmark": "deepseekv3_671b",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before 6.0 submission",
+        "Platform": "64 NVIDIA GB300 nodes",
+        "Precision": "BF16",
+        "BS": 15360,
+        "Hyperparams": {
+            "opt_base_learning_rate": 0.000023238,
+            "opt_learning_rate_warmup_steps": 4,
+            "max_steps": 12000,
+            "gradient_accumulation_steps": 240
+        },
+        "Epochs to converge": [
+            // TODO(dfridman)
+        ]
+    },
+
+    "deepseekv3_671b_ref_16384":
+    {
+        "Benchmark": "deepseekv3_671b",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before 6.0 submission",
+        "Platform": "64 NVIDIA GB300 nodes",
+        "Precision": "BF16",
+        "BS": 16384,
+        "Hyperparams": {
+            "opt_base_learning_rate": 0.000024,
+            "opt_learning_rate_warmup_steps": 4,
+            "max_steps": 12000,
+            "gradient_accumulation_steps": 256
+        },
+        "Epochs to converge": [
+            // TODO(dfridman)
+        ]
+    },
+
+    "deepseekv3_671b_ref_18432":
+    {
+        "Benchmark": "deepseekv3_671b",
+        "Creator": "NVIDIA",
+        "When": "Reference RCPs before 6.0 submission",
+        "Platform": "64 NVIDIA GB300 nodes",
+        "Precision": "BF16",
+        "BS": 18432,
+        "Hyperparams": {
+            "opt_base_learning_rate": 0.000025456,
+            "opt_learning_rate_warmup_steps": 4,
+            "max_steps": 12000,
+            "gradient_accumulation_steps": 288
+        },
+        "Epochs to converge": [
+            // TODO(dfridman)
+        ]
+    }
+}
diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml
index 111921f0..55d00bf9 100644
--- a/mlperf_logging/result_summarizer/config.yaml
+++ b/mlperf_logging/result_summarizer/config.yaml
@@ -103,11 +103,12 @@ columns:
       llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"]
       default: [" ", " ", " "]
     "6.0.0":
-      dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"]
+      dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"]
       flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"]
       llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"]
       llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"]
      llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"]
+      deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"]
       default: [" ", " ", " "]

   hpc:
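Reviewer note (not part of the patch): the three RCPs are internally consistent with the closed-division rules: each learning rate matches 2.4e-05 * sqrt(BS / 16384) within the yaml's 1e-9 tolerance, and BS divided by gradient_accumulation_steps is a constant 64 across all three entries. A quick sanity-check sketch:

```python
# (BS, opt_base_learning_rate, gradient_accumulation_steps) from the RCP file.
rcps = [(15360, 0.000023238, 240),
        (16384, 0.000024,    256),
        (18432, 0.000025456, 288)]

for bs, lr, accum in rcps:
    expected = 2.4e-05 * (bs / 16384) ** 0.5   # closed-division sqrt scaling
    assert abs(lr - expected) < 1e-9, (bs, lr, expected)
    assert bs % accum == 0 and bs // accum == 64  # constant BS/accum ratio
    print(f"BS={bs}: lr={lr:.9f} vs rule {expected:.9f}, BS/accum={bs // accum}")
```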