From 91e50bfd4d9fae455cbe37b1770963c8a1ec8bb3 Mon Sep 17 00:00:00 2001 From: Su Ann Chong Date: Mon, 23 Feb 2026 09:00:03 -0600 Subject: [PATCH] update logging for gpt-oss-20b --- mlperf_logging/benchmark_meta.py | 4 +- mlperf_logging/compliance_checker/README.md | 2 + .../training_6.0.0/closed_common.yaml | 2 +- .../training_6.0.0/closed_gpt_oss_20b.yaml | 86 +++++++++++++++++++ .../training_6.0.0/open_common.yaml | 2 +- .../training_6.0.0/open_gpt_oss_20b.yaml | 6 ++ mlperf_logging/rcp_checker/rcp_checker.py | 3 +- .../training_6.0.0/rcps_gpt_oss_20b.json | 65 ++++++++++++++ mlperf_logging/result_summarizer/config.yaml | 1 + 9 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/closed_gpt_oss_20b.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.0.0/open_gpt_oss_20b.yaml create mode 100644 mlperf_logging/rcp_checker/training_6.0.0/rcps_gpt_oss_20b.json diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 593c91d7..279130a1 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -23,6 +23,7 @@ # TODO: Update with official values 'llama31_8b': 10, 'flux1': 10, + 'gpt_oss_20b': 10, }, 'hpc' : { @@ -161,7 +162,8 @@ 'dlrm_dcnv2', 'flux1', 'llama2_70b_lora', - 'llama31_405b' + 'llama31_405b', + 'gpt_oss_20b' ] }, diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index df66f132..19166ef3 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -30,11 +30,13 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ 6.0.0/closed_dlrm_dcnv2.yaml 6.0.0/closed_llama2_70b_lora.yaml 6.0.0/closed_flux1.yaml + 6.0.0/closed_gpt_oss_20b.yaml 6.0.0/open_llama31_8b.yaml 6.0.0/open_llama31_405b.yaml 6.0.0/open_dlrm_dcnv2.yaml 6.0.0/open_llama2_70b_lora.yaml 6.0.0/open_flux1.yaml + 
6.0.0/open_gpt_oss_20b.yaml ### Existing config files for HPC submissions diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml index c17d1432..b20fe5ce 100755 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b'] " POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_gpt_oss_20b.yaml new file mode 100644 index 00000000..25ce7606 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_gpt_oss_20b.yaml @@ -0,0 +1,86 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + 
REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1200000 " \ No newline at end of file diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml index ab82d076..87fff626 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b'] " POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_gpt_oss_20b.yaml new file mode 100644 index 00000000..0e55abc4 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_gpt_oss_20b.yaml @@ -0,0 +1,6 @@ +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" \ No newline at end of file diff --git 
a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index cdb383f7..f65583bd 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -60,6 +60,7 @@ def is_version_at_least(version, min_version): 'flux1': 10, 'llama31_405b': 3, 'llama31_8b': 10, + 'gpt_oss_20b': 10, }, "hpc": { 'cosmoflow': 10, @@ -109,7 +110,7 @@ def read_submission_file(result_file, ruleset, use_train_samples): eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"] eval_score = json.loads(eval_accuracy_str)["value"] stable_diffusion_eval_results[eval_step][eval_metric] = eval_score - elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b"} and ("eval_error" in str or "eval_accuracy" in str): + elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "gpt_oss_20b"} and ("eval_error" in str or "eval_accuracy" in str): eval_accuracy_str = str conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"] eval_score = json.loads(eval_accuracy_str)["value"] diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_gpt_oss_20b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_gpt_oss_20b.json new file mode 100644 index 00000000..3ef6a165 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_gpt_oss_20b.json @@ -0,0 +1,65 @@ +{ + "gpt_oss_20b_ref_16": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-04, + "opt_learning_rate_warmup_samples": 2048, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 159744, 159744, 159744, 159744, 159744, + 159744, 172032, 159744, 172032, 159744, + 172032, 159744, 159744, 159744, 159744, + 159744, 159744, 159744, 159744, 159744 + ] + }, + + "gpt_oss_20b_ref_32": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", 
"When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 2048, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 196608, 172032, 184320, 184320, 172032, + 172032, 184320, 184320, 184320, 172032, + 172032, 172032, 184320, 184320, 184320, + 172032, 172032, 172032, 184320, 184320 + ] + }, + + "gpt_oss_20b_ref_64": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 2048, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 233472, 208896, 208896, 233472, 233472, + 233472, 233472, 233472, 208896, 233472, + 233472, 233472, 245760, 221184, 208896, + 233472, 233472, 221184, 221184, 221184 + ] + } + } + \ No newline at end of file diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 111921f0..d6c60faa 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -108,6 +108,7 @@ columns: llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] + gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] default: [" ", " ", " "] hpc: