Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# TODO: Update with official values
'llama31_8b': 10,
'flux1': 10,
'gpt_oss_20b': 10,
},

'hpc' : {
Expand Down Expand Up @@ -161,7 +162,8 @@
'dlrm_dcnv2',
'flux1',
'llama2_70b_lora',
'llama31_405b'
'llama31_405b',
'gpt_oss_20b'
]
},

Expand Down
2 changes: 2 additions & 0 deletions mlperf_logging/compliance_checker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
6.0.0/closed_dlrm_dcnv2.yaml
6.0.0/closed_llama2_70b_lora.yaml
6.0.0/closed_flux1.yaml
6.0.0/closed_gpt_oss_20b.yaml
6.0.0/open_llama31_8b.yaml
6.0.0/open_llama31_405b.yaml
6.0.0/open_dlrm_dcnv2.yaml
6.0.0/open_llama2_70b_lora.yaml
6.0.0/open_flux1.yaml
6.0.0/open_gpt_oss_20b.yaml

### Existing config files for HPC submissions

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b'] "
POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Compliance rules for the MLPerf Training closed-division GPT-OSS-20B benchmark.
# Each KEY entry names a required log field. CHECK/POST strings are Python
# expressions evaluated by the compliance checker with v = the logged record
# and s = shared state accumulated by earlier POST rules in this file.

- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
POST: >
s['global_batch_size'] = v['value']

# Sequence length is fixed for the closed division.
- KEY:
NAME: max_sequence_length
REQ: EXACTLY_ONE
CHECK: " v['value'] == 8192 "

# Optimizer is fixed to AdamW.
- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adamw' "

# Learning-rate endpoints must be logged but carry no CHECK here
# (tunable within closed-division HP rules).
- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_end_learning_rate
REQ: EXACTLY_ONE

# Warmup steps are stashed in shared state so the decay-steps rule
# below can reference them; this rule must stay ABOVE that one.
- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE
POST: >
s['opt_learning_rate_warmup_steps'] = v['value']

# Warmup + decay must exactly cover the 1,200,000-step budget enforced
# by the max_steps rule at the bottom of this file.
- KEY:
NAME: opt_learning_rate_decay_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] "

- KEY:
NAME: opt_learning_rate_decay_schedule
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'cosine with linear warmup' "

# Fixed AdamW hyperparameters for the closed division.
- KEY:
NAME: opt_adamw_beta_1
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.9 "

- KEY:
NAME: opt_adamw_beta_2
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.95 "

- KEY:
NAME: opt_adamw_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-05 "

- KEY:
NAME: opt_adamw_weight_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.1 "

- KEY:
NAME: opt_gradient_clip_norm
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1.0 "

# Accumulation is free to vary but must be positive.
- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1024 " if False else " v['value'] > 0 "
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b'] "
POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) "
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Open-division rules for the GPT-OSS-20B benchmark: only the eval-accuracy
# convergence requirement is enforced; hyperparameters are unconstrained.
#
# Fix: the metadata check must look for 'samples_count', not 'epoch_num'.
# The RCP checker extracts metadata["samples_count"] from gpt_oss_20b
# eval_accuracy records, and the closed-division config for this benchmark
# checks 'samples_count' as well; requiring 'epoch_num' here would reject
# logs that every other gpt_oss_20b rule accepts.
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'samples_count' in v['metadata']"
    ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0"
3 changes: 2 additions & 1 deletion mlperf_logging/rcp_checker/rcp_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def is_version_at_least(version, min_version):
'flux1': 10,
'llama31_405b': 3,
'llama31_8b': 10,
'gpt_oss_20b': 10,
},
"hpc": {
'cosmoflow': 10,
Expand Down Expand Up @@ -109,7 +110,7 @@ def read_submission_file(result_file, ruleset, use_train_samples):
eval_metric = json.loads(eval_accuracy_str)["metadata"]["metric"]
eval_score = json.loads(eval_accuracy_str)["value"]
stable_diffusion_eval_results[eval_step][eval_metric] = eval_score
elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b"} and ("eval_error" in str or "eval_accuracy" in str):
elif benchmark in {"llama2_70b_lora", "flux1", "llama31_405b", "llama31_8b", "gpt_oss_20b"} and ("eval_error" in str or "eval_accuracy" in str):
eval_accuracy_str = str
conv_epoch = json.loads(eval_accuracy_str)["metadata"]["samples_count"]
eval_score = json.loads(eval_accuracy_str)["value"]
Expand Down
65 changes: 65 additions & 0 deletions mlperf_logging/rcp_checker/training_6.0.0/rcps_gpt_oss_20b.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"gpt_oss_20b_ref_16":
{
"Benchmark": "gpt_oss_20b",
"Creator": "AMD",
"When": "Reference RCPs before 6.0 submission",
"Platform": "1xMI355X",
"Precision": "BF16",
"BS": 16,
"Hyperparams": {
"opt_base_learning_rate": 4e-04,
"opt_learning_rate_warmup_samples": 2048,
"gradient_accumulation_steps": 1
},
"Comment": "Placeholder - official convergence samples pending: 159744, 159744, 159744, 159744, 159744, 159744, 172032, 159744, 172032, 159744, 172032, 159744, 159744, 159744, 159744, 159744, 159744, 159744, 159744, 159744",
"Epochs to converge": []
},

"gpt_oss_20b_ref_32":
{
"Benchmark": "gpt_oss_20b",
"Creator": "AMD",
"When": "Reference RCPs before 6.0 submission",
"Platform": "1xMI355X",
"Precision": "BF16",
"BS": 32,
"Hyperparams": {
"opt_base_learning_rate": 8e-04,
"opt_learning_rate_warmup_samples": 2048,
"gradient_accumulation_steps": 1
},
"Comment": "Placeholder - official convergence samples pending: 196608, 172032, 184320, 184320, 172032, 172032, 184320, 184320, 184320, 172032, 172032, 172032, 184320, 184320, 184320, 172032, 172032, 172032, 184320, 184320",
"Epochs to converge": []
},

"gpt_oss_20b_ref_64":
{
"Benchmark": "gpt_oss_20b",
"Creator": "AMD",
"When": "Reference RCPs before 6.0 submission",
"Platform": "1xMI355X",
"Precision": "BF16",
"BS": 64,
"Hyperparams": {
"opt_base_learning_rate": 8e-04,
"opt_learning_rate_warmup_samples": 2048,
"gradient_accumulation_steps": 2
},
"Comment": "Placeholder - official convergence samples pending: 233472, 208896, 208896, 233472, 233472, 233472, 233472, 233472, 208896, 233472, 233472, 233472, 245760, 221184, 208896, 233472, 233472, 221184, 221184, 221184",
"Epochs to converge": []
}
}

1 change: 1 addition & 0 deletions mlperf_logging/result_summarizer/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ columns:
llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"]
llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"]
llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"]
gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"]
default: [" ", " ", " "]

hpc:
Expand Down