From 362c0888422cf6f8d1487e439d8c9364cb703f8c Mon Sep 17 00:00:00 2001 From: Wang Kang Date: Fri, 5 Dec 2025 15:53:40 +0800 Subject: [PATCH 1/4] add matmul scripts --- base_test/matmul_test/READE.md | 57 ++ .../summarize_f32_f16_bf16_q8_fp8_log.py | 122 ++++ .../summarize_fp64_tf32_log.py | 120 ++++ .../exetrct_log_tools/summary_mixed_data.py | 64 ++ .../fp64_tf32_src/build_gemm_fp64.sh | 1 + .../fp64_tf32_src/build_gemm_tf32.sh | 1 + .../matmul_test/fp64_tf32_src/gemm_fp64.mu | 122 ++++ .../matmul_test/fp64_tf32_src/gemm_tf32.cpp | 678 ++++++++++++++++++ .../test_gemm_f32_f16_bf16_q8_fp8.sh | 48 ++ base_test/matmul_test/test_gemm_fp64_tf32.sh | 84 +++ base_test/matmul_test/test_gemm_mixed.sh | 86 +++ {script => base_test}/monitor/README.md | 0 {script => base_test}/monitor/monitor_gpu.sh | 0 13 files changed, 1383 insertions(+) create mode 100644 base_test/matmul_test/READE.md create mode 100644 base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py create mode 100644 base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py create mode 100644 base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py create mode 100644 base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh create mode 100644 base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh create mode 100644 base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu create mode 100644 base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp create mode 100644 base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh create mode 100644 base_test/matmul_test/test_gemm_fp64_tf32.sh create mode 100644 base_test/matmul_test/test_gemm_mixed.sh rename {script => base_test}/monitor/README.md (100%) rename {script => base_test}/monitor/monitor_gpu.sh (100%) mode change 100755 => 100644 diff --git a/base_test/matmul_test/READE.md b/base_test/matmul_test/READE.md new file mode 100644 index 0000000..c54a072 --- /dev/null +++ b/base_test/matmul_test/READE.md @@ -0,0 +1,57 @@ +Matmul 自动化测试脚本 +# 1. 
脚本说明
+matmul 存放位置:
+```shell
+mudnn_bench
+├── bench_test_matmul.sh
+├── bin
+│   ├── mudnn_bench -> mudnn_bench-x.x.x
+│   └── mudnn_bench-x.x.x
+├── matmul_test
+```
+mudnn_bench 示例:
+```shell
+
+# 示例 1:单卡,大矩阵,f32
+MUSA_VISIBLE_DEVICES=4 ./bin/mudnn_bench -m --mm_m 6144 --mm_n 3584 --mm_k 6144 --warmup 30 --tm i --tmv 1000 -p -t f32
+
+# 示例 2:多卡,标准尺寸,bf16
+MUSA_VISIBLE_DEVICES=0,1 ./bin/mudnn_bench -m --mm_m 4096 --mm_n 4096 --mm_k 4096 --warmup 30 --tm i --tmv 1000 -p -t bf16
+
+# 示例 3:单卡,特殊组合,int8
+MUSA_VISIBLE_DEVICES=2 ./bin/mudnn_bench -m --mm_m 8192 --mm_n 8192 --mm_k 768 --warmup 30 --tm i --tmv 1000 -p -t int8
+
+# 示例 4:使用混合精度格式
+MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048 --warmup 30 --tm i --tmv 1000 -p -t bf16:q4:bf16:bf16
+```
+
+# 2. 测试
+## 2.1 fp64, tf32 测试
+```shell
+# 1. 编译
+bash ./fp64_tf32_src/build_gemm_tf32.sh
+
+bash ./fp64_tf32_src/build_gemm_fp64.sh
+
+# 2. 测试
+bash test_gemm_fp64_tf32.sh
+```
+
+## 2.2 f32_f16_bf16_q8_fp8 测试
+```shell
+bash test_gemm_f32_f16_bf16_q8_fp8.sh
+```
+
+## 2.3 混合精度测试
+```shell
+# A,B: fp16, C,D: f32: "f16:f16:f32:f32"
+# A,B: bf16, C,D: f32: "bf16:bf16:f32:f32"
+# A,B: tf32, C,D: f32: "f32"
+# A,B: int8, C,D: int32: "int8"
+# W8A8: "q8:q8:f32:f32"
+# W4A16: "bf16:q4:bf16:bf16"
+# A,B: fp8, C,D: fp16: "float8_e4m3:float8_e4m3:f16:f16"
+
+bash test_gemm_mixed.sh
+```
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
new file mode 100644
index 0000000..95a2a28
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
@@ -0,0 +1,144 @@
+"""Summarize mudnn_bench matmul logs (f32/f16/bf16/q8/fp8) into a CSV file."""
+import os
+import re
+import sys
+from typing import Dict, List, Optional
+
+# A row is emitted only when all of these keys were parsed for a test block.
+REQUIRED_KEYS = ("datatype", "m", "n", "k", "elapsed_time", "throughput_tops")
+# Line printed between two consecutive test cases in the log.
+BLOCK_SEPARATOR = "=============================="
+CSV_HEADERS = ["datatype", "shape", "Throughput(TOPS)", "AverageElapsedTime(ms)"]
+# Accept "12" as well as "12.5" so integer-valued results are not dropped.
+NUMBER = r"(\d+(?:\.\d+)?)"
+
+
+def _to_record(block: Dict[str, str]) -> Optional[Dict[str, str]]:
+    """Return the CSV row for *block*, or None when a required key is missing."""
+    if not all(key in block for key in REQUIRED_KEYS):
+        return None
+    return {
+        "datatype": block["datatype"],
+        "shape": f"{block['m']}-{block['n']}-{block['k']}",
+        "Throughput(TOPS)": block["throughput_tops"],
+        "AverageElapsedTime(ms)": block["elapsed_time"],
+    }
+
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    """Parse *log_path* and return one row per complete test block.
+
+    Returns an empty list when the log cannot be read or decoded.
+    """
+    patterns = {
+        "datatype": re.compile(r"DataType (\w+)"),
+        "mat_params": re.compile(r"m (\d+), n (\d+), k (\d+)"),
+        "elapsed_time": re.compile(r"AverageElapsedTime\(ms\) : " + NUMBER),
+        "throughput_gops": re.compile(r"Throughput " + NUMBER + r" GOPS"),
+    }
+
+    extracted = []
+    current_block = {}
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+
+                dt_match = patterns["datatype"].search(line)
+                if dt_match:
+                    current_block["datatype"] = dt_match.group(1)
+
+                mp_match = patterns["mat_params"].search(line)
+                if mp_match:
+                    current_block["m"] = mp_match.group(1)
+                    current_block["n"] = mp_match.group(2)
+                    current_block["k"] = mp_match.group(3)
+
+                et_match = patterns["elapsed_time"].search(line)
+                if et_match:
+                    current_block["elapsed_time"] = et_match.group(1)
+
+                tp_match = patterns["throughput_gops"].search(line)
+                if tp_match:
+                    # The log reports GOPS; the summary column is TOPS.
+                    tops = round(float(tp_match.group(1)) / 1000, 4)
+                    current_block["throughput_tops"] = str(tops)
+
+                if line == BLOCK_SEPARATOR and current_block:
+                    record = _to_record(current_block)
+                    if record:
+                        extracted.append(record)
+                    current_block = {}
+
+        # The last block has no closing separator; flush it through the same
+        # helper so its shape uses the "m-n-k" format like every other row.
+        record = _to_record(current_block)
+        if record:
+            extracted.append(record)
+
+    except (OSError, UnicodeError) as e:
+        print(f"❌ 读取日志失败:{str(e)}")
+        return []
+
+    return extracted
+
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    """Write *data* to *output_path*; return True on success, False otherwise."""
+    if not data:
+        print("⚠️ 未提取到有效数据,跳过CSV生成")
+        return False
+
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(", ".join(CSV_HEADERS) + "\n")
+            for item in data:
+                f.write(", ".join(item[h] for h in CSV_HEADERS) + "\n")
+        print(f"✅ CSV生成成功:{output_path}")
+        return True
+    except Exception as e:
+        print(f"❌ 生成CSV失败:{str(e)}")
+        return False
+
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    """Extract *input_log* into *output_csv* (default: <log name>_summary.csv)."""
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在:{input_log}")
+        return
+
+    if not output_csv:
+        log_dir = os.path.dirname(input_log)
+        log_name = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(log_dir, f"{log_name}_summary.csv")
+
+    print(f"📊 开始提取日志数据:{input_log}")
+    matmul_data = extract_matmul_data(input_log)
+
+    if not matmul_data:
+        print("❌ 未提取到任何有效测试数据")
+        return
+
+    print(f"✅ 成功提取 {len(matmul_data)} 条测试记录")
+
+    # Only announce completion when the CSV was actually written.
+    if generate_csv(matmul_data, output_csv):
+        print("🎯 所有操作完成!")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("用法:")
+        print(" python summarize_f32_f16_bf16_q8_fp8_log.py <输入日志文件路径>")
+        print("示例:")
+        print(" python summarize_f32_f16_bf16_q8_fp8_log.py bench.log")
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    # Swap only the final extension: str.replace would also rewrite ".log"
+    # inside directory names, and would leave a path without ".log" unchanged
+    # so that the CSV overwrote the input log.
+    output_path = os.path.splitext(input_path)[0] + ".csv"
+    main(input_path, output_path)
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
new file mode 100644
index 0000000..31dacbd
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
@@ -0,0 +1,120 @@
+import re
+import sys
+import os
+from typing import List, Dict, Optional
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    patterns = {
+        "datatype": re.compile(r"MatMul (\w+) Test \(MUSA\)"),
+        "mat_params": re.compile(r"m = (\d+), n = (\d+), k = (\d+)"),
+        "duration_us": re.compile(r"Duration:(\s*[\d\.]+) us"),
+        "tflops":
re.compile(r"computation-\w+=(\s*[\d\.]+)")
+    }
+
+    extracted = []
+    current_block = {}
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+
+                dt_match = patterns["datatype"].search(line)
+                if dt_match:
+                    current_block["datatype"] = dt_match.group(1)
+
+                mp_match = patterns["mat_params"].search(line)
+                if mp_match:
+                    current_block["m"] = mp_match.group(1)
+                    current_block["n"] = mp_match.group(2)
+                    current_block["k"] = mp_match.group(3)
+
+                dur_match = patterns["duration_us"].search(line)
+                if dur_match:
+                    us_val = float(dur_match.group(1).strip())
+                    ms_val = round(us_val / 1000, 6)
+                    current_block["duration_ms"] = str(ms_val)
+
+                tf_match = patterns["tflops"].search(line)
+                if tf_match:
+                    tf_val = tf_match.group(1).strip()
+                    current_block["tflops"] = str(round(float(tf_val), 6))
+
+                if line == "========================================" and current_block:
+                    required = ["datatype", "m", "n", "k", "duration_ms", "tflops"]
+                    if all(key in current_block for key in required):
+                        shape = f"{current_block['m']}-{current_block['n']}-{current_block['k']}"
+                        extracted.append({
+                            "DataType": current_block["datatype"],
+                            "shape": shape,
+                            "Compute_ability(TFLOPS)": current_block["tflops"],
+                            "AverageElapsedTime(ms)": current_block["duration_ms"]
+                        })
+                    current_block = {}
+
+        required = ["datatype", "m", "n", "k", "duration_ms", "tflops"]
+        if current_block and all(key in current_block for key in required):
+            shape = f"{current_block['m']}-{current_block['n']}-{current_block['k']}"
+            extracted.append({
+                "DataType": current_block["datatype"],
+                "shape": shape,
+                "Compute_ability(TFLOPS)": current_block["tflops"],
+                "AverageElapsedTime(ms)": current_block["duration_ms"]
+            })
+
+    except Exception as e:
+        print(f"❌ 读取日志失败:{str(e)}")
+        return []
+
+    return extracted
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    if not data:
+        print("⚠️ 未提取到有效数据,跳过CSV生成")
+        return False
+
+    headers = ["DataType", "shape", "Compute_ability(TFLOPS)", "AverageElapsedTime(ms)"]
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(",".join(headers) + "\n")  # same delimiter as the data rows
+            for item in data:
+                row = [item[h] for h in headers]
+                f.write(",".join(row) + "\n")
+        print(f"✅ CSV生成成功:{output_path}")
+        return True
+    except Exception as e:
+        print(f"❌ 生成CSV失败:{str(e)}")
+        return False
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在:{input_log}")
+        return
+
+    if not output_csv:
+        log_dir = os.path.dirname(input_log)
+        log_name = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(log_dir, f"{log_name}_summary.csv")
+
+    print(f"📊 开始提取日志数据:{input_log}")
+    matmul_data = extract_matmul_data(input_log)
+
+    if not matmul_data:
+        print("❌ 未提取到任何有效测试数据")
+        return
+
+    print(f"✅ 成功提取 {len(matmul_data)} 条测试记录")
+    generate_csv(matmul_data, output_csv)
+    print("🎯 所有操作完成!")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("用法:")
+        print(" python summarize_fp64_tf32_log.py <输入日志文件路径>")
+        print("示例:")
+        print(" python summarize_fp64_tf32_log.py bench.log")
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    output_path = os.path.splitext(input_path)[0] + '.csv'  # swap only the final extension
+    main(input_path, output_path)
diff --git a/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
new file mode 100644
index 0000000..5edfa15
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
@@ -0,0 +1,70 @@
+"""Convert a mixed-precision mudnn_bench log into a CSV summary."""
+import csv
+import os
+import re
+import sys
+
+# Marks the start of a test case and carries its parameters.
+RE_START = re.compile(r"测试:\s*M=(\d+),\s*N=(\d+),\s*K=(\d+),\s*Type=([\w:]+)")
+# Result line holding the average time and the throughput.
+RE_RESULT = re.compile(r"AverageElapsedTime\(ms\)\s*:\s*([\d\.]+)\s*,\s*Throughput\s*([\d\.]+)\s*GOPS")
+FIELDNAMES = ["M", "N", "K", "Type", "AvgTime(ms)", "GOPS"]
+
+
+def parse_log(lines):
+    """Return one record per start line that is followed by a result line."""
+    records = []
+    current = None
+    for line in lines:
+        line = line.strip()
+
+        started = RE_START.search(line)
+        if started:
+            current = started.groups()
+            continue
+
+        result = RE_RESULT.search(line)
+        if result and current is not None:
+            elapsed, gops = result.groups()
+            records.append(dict(zip(FIELDNAMES, (*current, elapsed, gops))))
+            # Reset so a stray result line is never paired with stale parameters.
+            current = None
+    return records
+
+
+def main(argv):
+    """Parse the log named by argv[1] and write <log stem>.csv next to it."""
+    if len(argv) < 2:
+        print("Usage: python summary_mixed_data.py <log_file>")
+        return 1
+
+    log_file = argv[1]
+    print(f"📊 正在读取并解析日志:{log_file}")
+
+    if not os.path.exists(log_file):
+        print("❌ 日志文件不存在")
+        return 1
+
+    with open(log_file, "r", encoding="utf-8") as f:
+        records = parse_log(f)
+
+    if not records:
+        print("⚠️ 未提取到任何有效数据")
+        return 0
+
+    # Swap only the final extension so a path without ".log" cannot make the
+    # CSV overwrite the log itself.
+    csv_path = os.path.splitext(log_file)[0] + ".csv"
+    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
+        writer.writeheader()
+        writer.writerows(records)
+
+    print(f"✅ 解析完成,共 {len(records)} 条数据")
+    print(f"📄 CSV 已生成:{csv_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
+
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
new file mode 100644
index 0000000..4d33fd1
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
@@ -0,0 +1 @@
+cd "$(dirname "$0")" && mcc gemm_fp64.mu -lmusart -lmublas -o gemm_fp64 --offload-arch=mp_31
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
new file mode 100644
index 0000000..83b5acb
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
@@ -0,0 +1 @@
+cd "$(dirname "$0")" && g++ gemm_tf32.cpp -std=c++17 -I/usr/local/musa/include -L /usr/local/musa/lib/ -fopenmp -lmudnn -lmusart -o gemm_tf32 -O2
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
new file mode 100644
index 0000000..ac62c9e
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
@@ -0,0 +1,146 @@
+// NOTE(review): header names were restored from the symbols used below;
+// confirm them against the MUSA SDK include directory.
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include <mublas.h>
+#include <musa_runtime.h>
+
+// Abort with a readable message when a MUSA runtime call fails, so a failed
+// allocation or copy is never reported as a benchmark result.
+#define CHECK_MUSA(call)                                             \
+    do {                                                             \
+        musaError_t err_ = (call);                                   \
+        if (err_ != musaSuccess) {                                   \
+            std::cerr << "MUSA error: " << musaGetErrorString(err_)  \
+                      << " in " << #call << std::endl;               \
+            std::exit(EXIT_FAILURE);                                 \
+        }                                                            \
+    } while (0)
+
+// Problem size; overwritten from the command line in main().
+size_t M = 16384;
+size_t N = 16384;
+size_t K = 16384;
+
+struct PrecisionConfig
+{
+    int bytesPerElement;
+    const char *name;
+    int NUM_ITERATIONS;
+    int WARMUP_ITERATIONS = 10;
+};
+
+// Enqueue `iterations` mublasDgemm calls (A not transposed, B transposed).
+static void run_dgemm(mublasHandle_t handle, int iterations,
+                      const double *alpha, const double *beta,
+                      const double *d_A, const double *d_B, double *d_C)
+{
+    for (int i = 0; i < iterations; ++i)
+    {
+        mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_T,
+                    M, N, K, alpha,
+                    d_A, M,
+                    d_B, N,
+                    beta,
+                    d_C, M);
+    }
+}
+
+// Time config.NUM_ITERATIONS DGEMM calls after a warm-up and print the
+// average per-call duration and the achieved TFLOPS.
+void test(const PrecisionConfig &config)
+{
+    double *d_A, *d_B, *d_C;
+    std::vector<double> h_A(M * K, double(0.9f));
+    std::vector<double> h_B(K * N, double(1.2f));
+    std::vector<double> h_C(M * N);
+
+    CHECK_MUSA(musaMalloc(&d_A, M * K * config.bytesPerElement));
+    CHECK_MUSA(musaMalloc(&d_B, K * N * config.bytesPerElement));
+    CHECK_MUSA(musaMalloc(&d_C, M * N * config.bytesPerElement));
+
+    CHECK_MUSA(musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice));
+
+    mublasHandle_t handle;
+    mublasCreate(&handle);
+
+    double alpha = 1.0f;
+    double beta = 0.0f;
+
+    run_dgemm(handle, config.WARMUP_ITERATIONS, &alpha, &beta, d_A, d_B, d_C);
+    // Drain the warm-up work so it is not part of the timed region.
+    CHECK_MUSA(musaDeviceSynchronize());
+
+    auto start = std::chrono::high_resolution_clock::now();
+    run_dgemm(handle, config.NUM_ITERATIONS, &alpha, &beta, d_A, d_B, d_C);
+    CHECK_MUSA(musaDeviceSynchronize());
+    auto end = std::chrono::high_resolution_clock::now();
+
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    // Floating-point average: integer division truncated the per-call time.
+    const double avg_us = static_cast<double>(duration.count()) / config.NUM_ITERATIONS;
+    // Fixed notation keeps the figures free of exponents, which a log parser
+    // that only accepts digits and '.' would otherwise cut short.
+    std::cout << std::fixed << std::setprecision(3)
+              << "Average " << config.name << " Single Op Duration: "
+              << avg_us << " us" << std::endl;
+
+    double time_second = duration.count() / 1.0e6;
+    double flops = 2.0 * M * N * K * config.NUM_ITERATIONS;
+    double FLOPS = flops / time_second;
+    double TFLOPS = FLOPS / 1.0e12;
+
+    std::cout << std::setprecision(6) << "[FlagPerf Result]"
+              << "computation-FP64=" << TFLOPS << "TFLOPS"
+              << std::endl;
+
+    CHECK_MUSA(musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost));
+
+    musaFree(d_A);
+    musaFree(d_B);
+    musaFree(d_C);
+
+    mublasDestroy(handle);
+}
+
+int main(int argc, char* argv[]) {
+
+    if (argc != 5) {
+        std::cerr << "Usage: " << argv[0] << " <m> <n> <k> <iterations>" << std::endl;
+        std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    int m = std::atoi(argv[1]);
+    int n = std::atoi(argv[2]);
+    int k = std::atoi(argv[3]);
+    int iter = std::atoi(argv[4]);
+
+    // atoi yields 0 for non-numeric text; reject it along with negative values,
+    // which would wrap when stored in the size_t globals.
+    if (m <= 0 || n <= 0 || k <= 0 || iter <= 0) {
+        std::cerr << "m, n, k and iterations must be positive integers" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "========================================" << std::endl;
+    std::cout << "MatMul FP64 Test (MUSA)" << std::endl;
+    std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+    std::cout << "Test Iterations = " << iter << std::endl;
+
+    M = m;
+    N = n;
+    K = k;
+    CHECK_MUSA(musaSetDevice(0));
+    PrecisionConfig fp64_PrecisionConfig = {sizeof(double), "FP64", iter, 40};
+
+    test(fp64_PrecisionConfig);
+
+    return 0;
+}
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
new file mode 100644
index 0000000..6221eed
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
@@ -0,0 +1,678 @@
+/* Copyright @2020-2024 Moore Threads Technology Co., Ltd("Moore Threads"). All
+ * rights reserved.
+ *
+ * This software ("this software and its documentations" or "the software") is
+ * protected by Copyright and the information contained herein is confidential.
+ *
+ * The software contained herein is PROPRIETARY to Moore Threads and is being
+ * provided under the terms and conditions of a form of Moore Threads software
+ * license agreement by and between Moore Threads and Licensee ("License
+ * Agreement") or electronically accepted by Licensee.
Notwithstanding any
+ * terms or conditions to the contrary in the License Agreement, copy or
+ * disclosure of the software to any third party without the express written
+ * consent of Moore Threads is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE
+ * AGREEMENT, MOORE THREADS MAKES NO REPRESENTATION ABOUT ANY WARRANTIES,
+ * INCLUDING BUT NOT LIMITED TO THE SUITABILITY OF THE SOFTWARE FOR ANY
+ * PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
+ * ANY KIND. MOORE THREADS DISCLAIMS ALL WARRANTIES WITH REGARD TO THE
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL MOORE THREADS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THE SOFTWARE.
+ */
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+using qint8 = int8_t;  // host element type used for DType::q8 buffers
+
+#define SHOW printf
+
+namespace Eigen {
+  struct half;
+  struct bfloat16;
+}
+using Eigen::bfloat16;
+using Eigen::half;
+
+
+struct MatMulParam {  // benchmark settings copied into TestMatMul by its constructor
+  bool split_k{ false };
+  bool trans_a{ false };
+  bool trans_b{ true };
+  int batch{ 1 };
+  int m{ 6144 };
+  int n{ 8192 };
+  int k{ 19200 };
+  double alpha{ 1.0 };
+  double beta{ 0.0 };
+  double gamma{ 0.0 };
+  int mode{ 0 }; // 0 tensor, 1 scalar
+};
+
+#define CHECK_MUSA(...) \
+  do { \
+    int err = CheckMusaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
+    if (err) \
+      exit(err); \
+  } while (0)
+
+#define CHECK_ERR(...) \
+  do { \
+    int err = CheckError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
+    if (err) \
+      exit(err); \
+  } while (0)
+
+int CheckMusaError(musaError_t code, const char* expr, const char* file,
+  int line) {  // prints the failure and returns 1 when code is non-zero, else 0
+  if (code) {
+    printf("MUSA error at %s:%d, code=%d (%s) in '%s'", file, line, (int)code,
+      musaGetErrorString(code), expr);
+    return 1;
+  }
+  return 0;
+}
+
+int CheckError(bool code, const char* expr, const char* file, int line) {  // true means failure
+  if (code) {
+    printf("General error at %s:%d, code=%d (%s) in '%s'", file, line,
+      (int)code, "general error", expr);
+    return 1;
+  }
+  return 0;
+}
+
+template
+void GenerateRandom(Type* data, int64_t size, uint seed = 2333) {  // fills data[0..size) from a seeded engine
+  // non-determistic seed source
+  // std::random_device rd;
+  // std::default_random_engine engine(rd());
+  // constexpr auto seed = 2333;
+  std::default_random_engine engine(seed);
+  if (std::is_floating_point_v) {
+    std::uniform_real_distribution dist(0, 0);  // NOTE(review): both bounds are 0, so every element is 0 - confirm intended
+    for (auto i = 0; i < size; i++) {
+      data[i] = (Type)(dist(engine));
+    }
+  }
+  else {
+    std::uniform_int_distribution dist(0, 0);  // NOTE(review): both bounds are 0 here as well
+    for (auto i = 0; i < size; i++) {
+      data[i] = (Type)(dist(engine));
+    }
+  }
+}
+
+void MemFree(void* ptr) {  // deleter handed to MemoryHandler by MemoryFunc
+  if (ptr) {
+    musaFree(ptr);
+  }
+}
+
+::musa::dnn::MemoryHandler MemoryFunc(size_t size) {  // allocator callback passed to RunWithBiasAdd
+  void* data = nullptr;
+  if (size) {
+    musaMalloc(&data, size);  // NOTE(review): the return value is not checked
+    musaMemset(data, 0, size);
+  }
+  return ::musa::dnn::MemoryHandler(data, MemFree);
+}
+
+enum DType {
+  f32,
+  f16,
+  q8,
+  bf16,
+};
+
+
+class TestMatMul {
+public:
+  inline float F32MaskFormatTF32(float f) {  // returns f with the low 13 bits of its bit pattern cleared
+    unsigned int t = 0;
+    std::memcpy(&t, &f, sizeof(f));
+    // 1110 0000 0000 0000
+    t = t & 0xffffe000;
+    std::memcpy(&f, &t, sizeof(f));
+    return f;
+  }
+
+  // Random num generator
+
+
+  TestMatMul(const musaStream_t& _stream, const int _device_id, const DType _dtype, const MatMulParam _param, const int _iters)
+  {
+    stream = _stream;
+    device_id = _device_id;
+    dtype = _dtype;
+    dtype_size = 4;  // overwritten per dtype by the switch below
+
+    switch (dtype) {
+    case DType::f32:
+
dtype_str = "float32";
+      dtype_size = 4;
+      break;
+    case DType::f16:
+      dtype_str = "float16";
+      dtype_size = 2;
+      break;
+    case DType::bf16:
+      dtype_str = "bfloat16";
+      dtype_size = 2;
+      break;
+    case DType::q8:
+      dtype_str = "qint8";
+      dtype_size = 1;
+      break;
+    default:
+      bool DType_Not_Suppoted = true;
+      CHECK_ERR(DType_Not_Suppoted);  // CHECK_ERR exits the process when its argument is true
+      break;
+    }
+    split_k = _param.split_k;
+    trans_a = _param.trans_a;
+    trans_b = _param.trans_b;
+    batch = _param.batch;
+    m = _param.m;
+    n = _param.n;
+    k = _param.k;
+    alpha = _param.alpha;
+    beta = _param.beta;
+    gamma = _param.gamma;
+    mode = _param.mode;
+
+    iters = _iters;
+
+    handle = new ::musa::dnn::Handle(device_id);  // released in the destructor
+    handle->SetStream(stream);
+  };
+  ~TestMatMul() {
+#define FREE_H(_PTR) \
+  if (_PTR != nullptr) { \
+    operator delete(_PTR); \
+  }
+#define FREE_D(_PTR) \
+  if (_PTR != nullptr) { \
+    CHECK_MUSA(musaFree(_PTR)); \
+  }
+
+    FREE_H(h_buf_a);
+    FREE_H(h_buf_b);
+    FREE_H(h_buf_c);
+    FREE_H(h_buf_o);
+    FREE_H(h_buf_z);  // NOTE(review): Init() allocates h_buf_z with new char[], but FREE_H uses operator delete
+
+    FREE_D(d_a);
+    FREE_D(d_b);
+    FREE_D(d_c);
+    FREE_D(d_z);
+
+    FREE_D(d_base);
+    FREE_D(d_bool);
+    FREE_D(d_nonz);
+    FREE_H(h_nonz);
+
+#undef FREE_H
+#undef FREE_D
+
+    if (handle) {
+      delete handle;
+    }
+  };
+
+  bool Test() {  // warm up, run the timed loop, print the result; always returns true
+    // check parameters
+    CheckParams();
+    // initial memory && dnn tensor op
+    Init();
+    // warm up && prepare base golden
+    int warmup_iters = 40;
+    for (int i = 0; i < warmup_iters; i++) {
+      Exec();
+    }
+    // main loop
+    float elapsed_ms = 0.f;
+    musaEvent_t start, stop;
+    if (performance) {
+      CHECK_MUSA(musaEventCreate(&start));
+      CHECK_MUSA(musaEventCreate(&stop));
+      CHECK_MUSA(musaEventRecord(start, stream));
+    }
+
+    std::chrono::milliseconds bubble_time(bubble);
+    std::chrono::milliseconds duration_time(duration);
+    std::chrono::milliseconds show_gap_time(60000);
+    int show_gap_count = 0;
+    auto start_time = std::chrono::steady_clock::now();
+    auto current_time = start_time;
+    const bool blocking = (bubble > 0) || (iters == 0 && duration > 0);  // synchronize after each Exec in these modes
+    int stable_check_gap_count = 1;
+    int run_iters_count = 0;
+    int i = 0;
+    while ((iters > 0 && i < iters) ||
+      (iters == 0 && (current_time - start_time) <= duration_time)) {  // iteration-bound when iters > 0, else time-bound
+      // operator running
+      Exec(blocking);
+
+      if (bubble > 0) {
+        // SHOW("sleeping %d ms\n", bubble);
+        std::this_thread::sleep_for(bubble_time);
+      }
+      current_time = std::chrono::steady_clock::now();
+      if ((iters == 0 && duration > 0) &&
+        (current_time - start_time) > show_gap_time * show_gap_count) {
+        std::cout << "--- now execution time passed "
+          << (show_gap_time * show_gap_count).count() << std::endl;
+        show_gap_count++;
+      }
+      // SHOW("run loop %d\n", run_iters_count);
+      i++, stable_check_gap_count++, run_iters_count++;
+    }
+    // performance testing and stability checking are mutually exclusive
+    if (performance) {
+      CHECK_MUSA(musaEventRecord(stop, stream));
+      CHECK_MUSA(musaEventSynchronize(stop));
+      CHECK_MUSA(musaEventElapsedTime(&elapsed_ms, start, stop));
+      elapsed_ms = elapsed_ms / run_iters_count;  // NOTE(review): run_iters_count is 0 if the loop body never ran
+      ShowPerformance(elapsed_ms, (size_t)m * n * k * 2 / elapsed_ms * 1e-6,
+        !stable_check);
+      CHECK_MUSA(musaEventDestroy(start));
+      CHECK_MUSA(musaEventDestroy(stop));
+    }
+    return true;
+  }
+
+  void ShowPerformance(float t, float gops, bool credible) {  // credible is only used by the commented-out SHOW
+    // SHOW("dev_time : %f, gops : %f %s\n", t, credible ? gops : 0.f,
+    // credible
+    // ?
" "
+    // : " - the performance is not credible when enable stable checking");
+    SHOW("Average TF32 Single Op Duration:%f us\n", t * 1.0e3);  // t is scaled by 1e3 before printing as us
+    SHOW("[FlagPerf Result]computation-TF32=%f TFLOPS\n", gops / 1.0e3);  // gops is scaled by 1e-3 before printing as TFLOPS
+
+  }
+
+private:
+  void* h_buf_a = nullptr;
+  void* h_buf_b = nullptr;
+  void* h_buf_c = nullptr;
+  void* h_buf_o = nullptr;
+  void* h_buf_z = nullptr;
+
+  void* d_a = nullptr;
+  void* d_b = nullptr;
+  void* d_c = nullptr;
+  void* d_z = nullptr;
+
+  void* d_base = nullptr;
+  void* d_bool = nullptr;
+  void* d_nonz = nullptr;
+  int64_t* h_nonz = nullptr;
+
+  bool result_check = false;
+  bool stable_check = false;
+  bool stable_check_gpu = false;
+  bool performance = true;
+  bool verbose = false;
+  int iters = 1;  // overwritten by the constructor
+  int duration = 0;
+  int bubble = 0;
+  int gap = 1;
+  uint seed = 2333;
+
+  DType dtype = DType::f32;
+  std::string dtype_str = "float32";
+  size_t dtype_size = 4;
+  bool split_k = false;
+  bool trans_a = false;
+  bool trans_b = false;  // overwritten from MatMulParam by the constructor
+  int batch = 1;
+  int m = 1;
+  int n = 1;
+  int k = 1;
+  double alpha = 1.0;
+  double beta = 0.0;
+  double gamma = 0.0;
+  int mode = 0;
+
+  // qint8 variables
+  const float scale_a = 1.f / 32.f;
+  const float scale_b = 1.f / 32.f;
+  const float scale_c = 32.f;
+
+  // mudnn variables
+  musaStream_t stream;
+  int device_id;
+  ::musa::dnn::Handle* handle;
+  ::musa::dnn::MatMul op;
+
+  ::musa::dnn::Tensor tensor_a;
+  ::musa::dnn::Tensor tensor_b;
+  ::musa::dnn::Tensor tensor_c;
+  ::musa::dnn::Tensor tensor_z;
+  ::musa::dnn::Tensor tensor_base;
+  ::musa::dnn::Tensor tensor_bool;
+  ::musa::dnn::Tensor tensor_nonz;
+
+private:
+
+
+  ::musa::dnn::Tensor::Type GetmuDNNType(const std::string& dtype) {  // maps a type name to the muDNN tensor type
+    using T = ::musa::dnn::Tensor::Type;
+    static std::map type_mapping = {
+      {"int8", T::INT8},
+      {"int16", T::INT16},
+      {"int32", T::INT32},
+
+      {"int", T::INT64},
+      {"int64", T::INT64},
+
+      {"uint8", T::UINT8},
+      {"uint16", T::UINT16},
+      {"uint32", T::UINT32},
+
+      {"uint", T::UINT64},
+      {"uint64", T::UINT64},
+
+      {"half", T::HALF},
+      {"float16", T::HALF},
+      {"bfloat16", T::BFLOAT16},
+
+      {"float32", T::FLOAT},
+      {"qint8", T::QINT8},
+
+      {"float", T::FLOAT},
+      {"float64", T::DOUBLE},
+      {"double", T::DOUBLE},
+
+      {"bool", T::BOOL},
+    };
+    if (type_mapping.find(dtype) != type_mapping.end()) {
+      return type_mapping.at(dtype);
+    }
+    else {
+      std::cerr << "GetmuDNNType error : " << dtype << std::endl;
+      return type_mapping.at("float");  // unknown names fall back to the "float" entry
+    }
+  }
+  bool CheckParams() {  // replaces unsupported settings with fallbacks and logs each one
+    bool pass = true;  // never set to false below, so the function always returns true
+    // param checking
+    if (mode != 0 && mode != 1) {
+      std::cerr << "MatMul mode setting error, fallback 0" << std::endl;
+      mode = 0;
+    }
+    if (m <= 0 || n <= 0 || k <= 0) {
+      std::cerr << "MatMul param setting error, fallback 1" << std::endl;
+      m = m > 0 ? m : 1;
+      n = n > 0 ? n : 1;
+      k = k > 0 ? k : 1;
+    }
+    if (gamma != 0) {
+      std::cerr << "MatMul unsupported gamma != 0 temporarily, fallback 0"
+        << std::endl;
+      gamma = 0;  // after this, the gamma != 0 branches in Init() are not taken
+    }
+    if (beta != 0) {
+      if (mode == 0) {
+        std::cerr << "MatMul unsupported beta != 0 when mode == 0, fallback 0"
+          << std::endl;
+        beta = 0;
+      }
+
+    }
+    if (dtype == DType::q8) {
+      // To be removed when binary supports QINT8
+      if (stable_check_gpu) {
+        std::cerr
+          << "MatMul unsupported qint8 for stable_check_gpu, fallback cpu "
+          << std::endl;
+        stable_check_gpu = false;
+      }
+      if (mode != 0) {
+        std::cerr << "MatMul mode must be 0 when qint8, fallback 0"
+          << std::endl;
+        mode = 0;
+      }
+    }
+
+    return pass;
+  }
+
+  bool Init() {  // allocates and fills host/device buffers, then configures tensors and op
+    size_t nr_elem_a = (size_t)(m)*k;
+    size_t nr_elem_b = (size_t)(k)*n;
+    size_t nr_elem_c = (size_t)(m)*n;
+    size_t nr_elem_z = (size_t)(n);
+
+    size_t size_a = nr_elem_a * dtype_size;
+    size_t size_b = nr_elem_b * dtype_size;
+    size_t size_c = nr_elem_c * dtype_size;
+    size_t size_z = nr_elem_z * dtype_size;
+
+    size_t mem_total, mem_free;
+    CHECK_MUSA(musaMemGetInfo(&mem_free, &mem_total));
+    size_t available_gpu_mem = mem_free;
+    size_t total_gpu_mem = mem_total;
+    size_t need_gpu_mem = size_a + size_b + size_c;
+    if (gamma != 0) {
+      need_gpu_mem +=
size_z;
+    }
+    if (stable_check && stable_check_gpu) {
+      need_gpu_mem +=
+        size_c + sizeof(bool) * nr_elem_c + sizeof(int64_t) * m * n * 2;
+    }
+    if ((need_gpu_mem > available_gpu_mem) || verbose) {
+      SHOW("%s : Need Device Memory %.2f GiB, Available Device Memory %.2f GiB "
+        "(Total %.2f GiB)\n",
+        (need_gpu_mem > available_gpu_mem) ? "Error" : "Verbose",
+        need_gpu_mem / 1024.f / 1024 / 1024,
+        available_gpu_mem / 1024.f / 1024 / 1024,
+        total_gpu_mem / 1024.f / 1024 / 1024);
+    }
+    CHECK_ERR(need_gpu_mem > available_gpu_mem);  // exits when the buffers would not fit in free device memory
+
+    // host buffer
+    h_buf_a = operator new(size_a); // new char[size_a]();
+    h_buf_b = operator new(size_b); // new char[size_b]();
+    h_buf_c = operator new(size_c); // new char[size_c]();
+    h_buf_o = operator new(size_c); // new char[size_c]();
+
+    // host data initialization
+    if (dtype == DType::f16) {
+      GenerateRandom((half*)(h_buf_a), nr_elem_a, seed);
+      GenerateRandom((half*)(h_buf_b), nr_elem_b, seed);
+      GenerateRandom((half*)(h_buf_c), nr_elem_c, seed);
+
+    }
+    else if (dtype == DType::bf16) {
+      GenerateRandom((bfloat16*)(h_buf_a), nr_elem_a, seed);
+      GenerateRandom((bfloat16*)(h_buf_b), nr_elem_b, seed);
+      GenerateRandom((bfloat16*)(h_buf_c), nr_elem_c, seed);
+    }
+    else if (dtype == DType::q8) {
+      GenerateRandom((qint8*)(h_buf_a), nr_elem_a, seed);
+      GenerateRandom((qint8*)(h_buf_b), nr_elem_b, seed);
+      GenerateRandom((qint8*)(h_buf_c), nr_elem_c, seed);
+    }
+    else {
+      GenerateRandom((float*)(h_buf_a), nr_elem_a, seed);
+      GenerateRandom((float*)(h_buf_b), nr_elem_b, seed);
+      GenerateRandom((float*)(h_buf_c), nr_elem_c, seed);
+    }
+
+    // tensor float 32 format
+    if ((dtype == DType::f32) && mode == 0) {  // mask every f32 input through F32MaskFormatTF32
+      for (size_t i = 0; i < nr_elem_a; i++) {
+        ((float*)h_buf_a)[i] = (float)F32MaskFormatTF32(((float*)h_buf_a)[i]);
+      }
+      for (size_t i = 0; i < nr_elem_b; i++) {
+        ((float*)h_buf_b)[i] = (float)F32MaskFormatTF32(((float*)h_buf_b)[i]);
+      }
+      for (size_t i = 0; i < nr_elem_c; i++) {
+        ((float*)h_buf_c)[i] = (float)F32MaskFormatTF32(((float*)h_buf_c)[i]);
+      }
+    }
+
+    // device buffer
+    CHECK_MUSA(musaMalloc(&d_a, size_a));
+    CHECK_MUSA(musaMalloc(&d_b, size_b));
+    CHECK_MUSA(musaMalloc(&d_c, size_c));
+
+    // transfer host data to device
+
+    CHECK_MUSA(musaMemcpy(d_a, h_buf_a, size_a, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_b, h_buf_b, size_b, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_c, h_buf_c, size_c, musaMemcpyHostToDevice));
+
+    // host and device buffer for gamma
+    if (gamma != 0) {
+      h_buf_z = new char[size_z]();  // NOTE(review): the destructor releases this with operator delete, not delete[]
+      CHECK_MUSA(musaMalloc(&d_z, size_z));
+      CHECK_MUSA(musaMemcpy(d_z, h_buf_z, size_z, musaMemcpyHostToDevice));  // NOTE(review): copied before h_buf_z is filled below
+      if (dtype == DType::f16) {
+        GenerateRandom((half*)(h_buf_z), nr_elem_z, seed);
+      }
+      else if (dtype == DType::bf16) {
+        GenerateRandom((bfloat16*)(h_buf_z), nr_elem_z, seed);
+      }
+      else if (dtype == DType::q8) {
+        GenerateRandom((qint8*)(h_buf_z), nr_elem_z, seed);
+      }
+      else {
+        GenerateRandom((float*)(h_buf_z), nr_elem_z, seed);
+      }
+    }
+
+
+    ::musa::dnn::Tensor::Type ttype = GetmuDNNType(dtype_str);
+    tensor_a.SetAddr(d_a);
+    tensor_a.SetType(ttype);
+    if (DType::q8 == dtype) {
+      tensor_a.SetQuantizationInfo(scale_a);
+    }
+    if (trans_a) {
+      tensor_a.SetNdInfo({ k, m });
+    }
+    else {
+      tensor_a.SetNdInfo({ m, k });
+    }
+
+    tensor_b.SetAddr(d_b);
+    tensor_b.SetType(ttype);
+    if (DType::q8 == dtype) {
+      tensor_b.SetQuantizationInfo(scale_b);
+    }
+    if (trans_b) {
+      tensor_b.SetNdInfo({ n, k });
+    }
+    else {
+      tensor_b.SetNdInfo({ k, n });
+    }
+
+    tensor_c.SetAddr(d_c);
+    tensor_c.SetType(ttype);
+    tensor_c.SetNdInfo({ m, n });
+    if (DType::q8 == dtype) {
+      tensor_c.SetQuantizationInfo(scale_c);
+    }
+
+    tensor_z.SetAddr(d_z);  // d_z is still nullptr unless the gamma != 0 branch above ran
+    tensor_z.SetType(ttype);
+    tensor_z.SetNdInfo({ n });
+
+    CHECK_MUSA(musaStreamSynchronize(stream));
+    CHECK_MUSA(musaDeviceSynchronize());
+
+
+    op.SetTranspose(trans_a, trans_b);
+    // op.SetSplitK(split_k);
+    op.SetAlpha(alpha);
+    op.SetBeta(beta);
+    op.SetGamma(gamma);
+
op.SetComputeMode(static_cast<::musa::dnn::MatMul::ComputeMode>(mode)); + + return true; + } + + void Exec(bool sync = false) { + CHECK_ERR(::musa::dnn::Status::SUCCESS != + op.RunWithBiasAdd(*handle, tensor_c, tensor_a, tensor_b, tensor_z, MemoryFunc)); + CHECK_MUSA(musaGetLastError()); + if (sync) { + CHECK_MUSA(musaStreamSynchronize(stream)); + } + } +}; + +int RunMatMul() { + + + int device_id = 5; + CHECK_MUSA(musaGetDevice(&device_id)); + + MatMulParam param; + const int iters = 42000; + musaStream_t stream; + CHECK_MUSA(musaStreamCreate(&stream)); + TestMatMul test_mm(stream, device_id, DType::f32, param, iters); + bool ret = test_mm.Test(); + CHECK_MUSA(musaStreamDestroy(stream)); + return ret; +} + + +int main(int argc, char* argv[]) { + + if (argc != 5) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl; + return EXIT_FAILURE; + } + + int m = std::atoi(argv[1]); + int n = std::atoi(argv[2]); + int k = std::atoi(argv[3]); + int iter = std::atoi(argv[4]); + + std::cout << "========================================" << std::endl; + std::cout << "MatMul TF32 Test (MUSA)" << std::endl; + std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl; + std::cout << "Test Iterations = " << iter << std::endl; + + int device_id = 0; + CHECK_MUSA(musaGetDevice(&device_id)); + + MatMulParam param; + param.m = m; + param.n = n; + param.k = k; + const int iters = iter; + musaStream_t stream; + CHECK_MUSA(musaStreamCreate(&stream)); + TestMatMul test_mm(stream, device_id, DType::f32, param, iters); + bool ret = test_mm.Test(); + CHECK_MUSA(musaStreamDestroy(stream)); + return ret; +} diff --git a/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh new file mode 100644 index 0000000..736c237 --- /dev/null +++ b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +input_data=$(cat <<'EOF' 
#!/bin/bash
# Sweep mudnn_bench matmul over a fixed list of "M N K" shapes for every data
# type in TEST_TYPES, append all output to a single log file, then run the
# Python summarizer on that log.
#
# Run from the matmul_test directory: the benchmark binary and the summarizer
# are addressed by relative path.

input_data=$(cat <<'EOF'
128 128 128
256 256 256
512 512 512
1024 1024 1024
2048 2048 2048
4096 4096 4096
8192 8192 8192
4098 4098 4098
8190 8190 8190
EOF
)
test_iter=1000

# GPU index handed to MUSA_VISIBLE_DEVICES; 7 is the historical default and
# can be overridden with GEMM_TEST_DEVICE on machines with fewer cards.
device="${GEMM_TEST_DEVICE:-7}"

TEST_TYPES=("f32" "f16" "bf16" "q8" "float8_e4m3" "float8_e5m2")
LOG_DIR="mudnn_bench_logs"
mkdir -p "$LOG_DIR"
log_file="${LOG_DIR}/bench_f32_f16_bf16_q8_fp8.log"
: > "$log_file"

for type in "${TEST_TYPES[@]}"; do
    echo "开始测试数据类型:$type"
    # Default IFS splits on spaces and tabs alike. Splitting on tab only put
    # a whole space-separated row into m, left n and k empty, and so skipped
    # every shape.
    while read -r m n k _; do
        # Skip blank or incomplete rows.
        if [[ -z "$m" || -z "$n" || -z "$k" ]]; then
            continue
        fi
        echo "$m $n $k"

        # stdin is redirected so the benchmark cannot consume the remaining
        # rows that feed this loop.
        MUSA_VISIBLE_DEVICES="$device" ../bin/mudnn_bench -m \
            -t "$type" \
            --mm_m="$m" --mm_n="$n" --mm_k="$k" \
            --mm_mode=0 \
            --tm i \
            --tmv "$test_iter" \
            -p \
            >> "$log_file" 2>&1 < /dev/null
        sleep 2
    done <<< "$input_data"
done

python exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py "$log_file"

echo "所有测试完成!日志目录:$LOG_DIR"
#!/bin/bash
# Run the standalone fp64 and tf32 GEMM binaries over a fixed list of
# "M N K" shapes, append all output to one log file, then run the Python
# summarizer on that log if it is present.
#
# Run from the matmul_test directory: the binaries and the summarizer are
# addressed by relative path.

# Matrix shapes to test, one "M N K" triple per row.
input_data=$(cat <<'EOF'
128 128 128
256 256 256
512 512 512
1024 1024 1024
2048 2048 2048
4096 4096 4096
8192 8192 8192
4098 4098 4098
8190 8190 8190
8192 768 8192
EOF
)

# Iterations per shape.
test_iter=1000

# GPU index handed to MUSA_VISIBLE_DEVICES; 7 is the historical default and
# can be overridden with GEMM_TEST_DEVICE on machines with fewer cards.
device="${GEMM_TEST_DEVICE:-7}"

# Data types to test; each maps to one binary below.
TEST_TYPES=("fp64" "tf32")

# Directory holding the compiled GEMM binaries.
EXE_DIR="./fp64_tf32_src"

# Log directory (made absolute so the final message is unambiguous).
LOG_DIR="mudnn_bench_logs"
mkdir -p "$LOG_DIR"
ABS_LOG_DIR=$(realpath "$LOG_DIR")
log_file="${ABS_LOG_DIR}/bench_fp64_tf32_types.log"
: > "$log_file"

# Log summarizer script.
PYTHON_SUMMARIZE="exetrct_log_tools/summarize_fp64_tf32_log.py"

for type in "${TEST_TYPES[@]}"; do
    echo "=============================="
    echo "开始测试:$type"
    echo "=============================="

    # Pick the binary for this data type.
    case "$type" in
        fp64) exe="${EXE_DIR}/gemm_fp64" ;;
        tf32) exe="${EXE_DIR}/gemm_tf32" ;;
        *)
            echo "未知类型: $type"
            continue
            ;;
    esac

    # The binary must exist and be executable, otherwise every shape below
    # would fail one by one with a permission error in the log.
    if [[ ! -x "$exe" ]]; then
        echo "错误:找不到可执行文件 $exe"
        continue
    fi

    while read -r m n k _; do
        # Skip blank or incomplete rows before announcing them.
        if [[ -z "$m" || -z "$n" || -z "$k" ]]; then
            continue
        fi

        echo "矩阵大小: M=$m, N=$n, K=$k"

        # stdin is redirected so the binary cannot consume the remaining
        # rows that feed this loop.
        MUSA_VISIBLE_DEVICES="$device" "$exe" "$m" "$n" "$k" "$test_iter" \
            >> "$log_file" 2>&1 < /dev/null
        sleep 1
    done <<< "$input_data"

done

# Summarize the log when the helper script is available.
if [[ -f "$PYTHON_SUMMARIZE" ]]; then
    python "$PYTHON_SUMMARIZE" "$log_file"
else
    echo "警告:Python 分析脚本不存在: $PYTHON_SUMMARIZE"
fi

echo "所有测试完成!日志目录:$ABS_LOG_DIR"
#!/bin/bash
# Sweep mudnn_bench matmul over a fixed list of "M N K" shapes for every
# mixed-precision type string in TEST_TYPES, record each command and its
# outcome in one log file, then run the Python summarizer on that log.
#
# Run from the matmul_test directory: the benchmark binary and the summarizer
# are addressed by relative path.

set -e

input_data=$(cat <<'EOF'
128 128 128
256 256 256
512 512 512
1024 1024 1024
2048 2048 2048
4096 4096 4096
8192 8192 8192
4098 4098 4098
8190 8190 8190
8192 768 8192
EOF
)
test_iter=1000

# GPU index handed to MUSA_VISIBLE_DEVICES; 7 is the historical default and
# can be overridden with GEMM_TEST_DEVICE on machines with fewer cards.
device="${GEMM_TEST_DEVICE:-7}"

TEST_TYPES=("f16:f16:f32:f32" "bf16:bf16:f32:f32" "f32" "int8" "q8:q8:f32:f32" "bf16:q4:bf16:bf16" "float8_e4m3:float8_e4m3:f16:f16")
LOG_DIR="mudnn_bench_logs"
mkdir -p "$LOG_DIR"
log_file="${LOG_DIR}/bench_fix_matmul.log"
: > "$log_file"

# Fail early when the benchmark binary is missing.
if [ ! -f "../bin/mudnn_bench" ]; then
    echo "错误:未找到 ../bin/mudnn_bench 可执行文件" | tee -a "$log_file"
    exit 1
fi

echo "开始测试,日志文件:$log_file"

for type in "${TEST_TYPES[@]}"; do
    echo "开始测试数据类型:$type" | tee -a "$log_file"

    while read -r m n k _; do
        # Skip blank rows.
        if [ -z "$m" ]; then
            continue
        fi

        echo "测试: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"

        # Every dimension must be a plain non-negative integer.
        if ! [[ "$m" =~ ^[0-9]+$ ]] || ! [[ "$n" =~ ^[0-9]+$ ]] || ! [[ "$k" =~ ^[0-9]+$ ]]; then
            echo "错误:参数不是数字: m=$m, n=$n, k=$k" | tee -a "$log_file"
            continue
        fi

        # Human-readable copy of the command, written to the log only.
        cmd="MUSA_VISIBLE_DEVICES=$device ../bin/mudnn_bench -m --mm_m=\"$m\" --mm_n=\"$n\" --mm_k=\"$k\" --warmup 30 --tm i --tmv \"$test_iter\" -p -c -t \"$type\""
        echo "执行命令: $cmd" >> "$log_file"

        # Run inside `if` so a failing benchmark does not trip `set -e`.
        # stdin is redirected so the benchmark cannot consume the remaining
        # rows that feed this loop.
        if MUSA_VISIBLE_DEVICES="$device" ../bin/mudnn_bench -m \
            --mm_m="$m" --mm_n="$n" --mm_k="$k" \
            --warmup 30 \
            --tm i \
            --tmv "$test_iter" \
            -p \
            -c \
            -t "$type" >> "$log_file" 2>&1 < /dev/null; then
            echo "测试成功: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"
        else
            exit_code=$?
            echo "测试失败: M=$m, N=$n, K=$k, Type=$type, 退出码: $exit_code" | tee -a "$log_file"
        fi

        echo "----------------------------------------" >> "$log_file"
        sleep 2
    done <<< "$input_data"
done

# The summarizer lives in exetrct_log_tools/ alongside the other log tools;
# the previous path did not exist, so `set -e` aborted the script here.
python exetrct_log_tools/summary_mixed_data.py "$log_file"
echo "所有测试完成!日志目录:$LOG_DIR"
echo "查看日志:cat $log_file"
编译 bash ./fp64_tf32_src/build_gemm_tf32.sh @@ -38,6 +41,12 @@ bash test_gemm_fp64_tf32.sh ``` ## 2.2 f32_f16_bf16_q8_fp8 测试 +mudnn_bench 测试矩阵value默认说明: +- 浮点:-0.5~0.5 +- fp8: 整型-10~10转浮点 +- qint4:-7~7 +- 整型:-127~127 +> 部分版本 mudnn_bench 工具支持全 0 测试(参数 `-z` 实现),需要和开发者确认 ```shell bash test_gemm_f32_f16_bf16_q8_fp8.sh ``` From d9b077c000844a900519247e2bed5bda2646b40c Mon Sep 17 00:00:00 2001 From: Wang Kang Date: Fri, 5 Dec 2025 16:51:11 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=E4=BB=A5=E5=8F=8A=E7=BC=96=E8=AF=91=E8=B7=AF=E5=BE=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base_test/matmul_test/READE.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/base_test/matmul_test/READE.md b/base_test/matmul_test/READE.md index 68b7532..59ed02f 100644 --- a/base_test/matmul_test/READE.md +++ b/base_test/matmul_test/READE.md @@ -1,7 +1,8 @@ Matmul 自动化测试 # 1. 脚本说明 -matmul 存放位置: +matmul 放置位置: ```shell +# mudnn_bench 默认存放在 /usr/local/musa/ 下 mudnn_bench ├── bench_test_matmul.sh ├── bin @@ -32,9 +33,10 @@ MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048 注意:fp64和tf32 数据类型调用非 mudnn 接口 ```shell # 1. 编译 -bash ./fp64_tf32_src/build_gemm_tf32.sh +cd ./fp64_tf32_src -bash ./fp64_tf32_src/build_gemm_fp64.sh +bash build_gemm_tf32.sh +bash build_gemm_fp64.sh ## 2. 
测试 bash test_gemm_fp64_tf32.sh From 1de46505642ba29abc0729d491b1b19f20680c91 Mon Sep 17 00:00:00 2001 From: Wang Kang Date: Tue, 13 Jan 2026 13:53:49 +0800 Subject: [PATCH 4/4] add gpu monitor --- base_test/monitor/README.md | 304 ++++++++++++++++++++-- base_test/monitor/README_sh.md | 42 +++ base_test/monitor/mthreads_gpu_monitor.py | 296 +++++++++++++++++++++ 3 files changed, 616 insertions(+), 26 deletions(-) create mode 100644 base_test/monitor/README_sh.md create mode 100644 base_test/monitor/mthreads_gpu_monitor.py diff --git a/base_test/monitor/README.md b/base_test/monitor/README.md index 3d353f3..3c8c93a 100644 --- a/base_test/monitor/README.md +++ b/base_test/monitor/README.md @@ -1,42 +1,294 @@ -# GPU 监控脚本使用说明 +# GPU Monitor for MTT (Mthreads) -该脚本用于定时采集指定 GPU 的温度和图形频率,支持高温/降频报警,并可将结果记录到日志文件中,适用于性能测试与运行状态监控。 +一个轻量级的 GPU 监控工具,用于实时采集和监控 Mthreads GPU 的性能指标。 ---- +## 功能特性 -## ✅ 脚本功能 +- 🚀 **实时监控**:每5秒自动刷新 GPU 信息 +- 🔔 **阈值告警**:支持温度和显存占比的告警机制 +- 📊 **CSV 日志**:自动记录 GPU 数据到 CSV 文件 +- 🔄 **灵活使用**:支持单次读取和循环监控两种模式 +- 🧵 **多线程**:后台线程处理,不阻塞主程序 +- 📈 **数据导出**:支持 Dict 和对象两种格式获取数据 -- 支持指定 GPU 设备编号 -- 可自定义刷新时间间隔和记录次数 -- 实时记录温度(℃)与图形频率(MHz) -- 高温(>95°C)或降频(<1750MHz)触发报警(门限可根据实际设备调整) -- 监控结束后自动统计报警次数 -- 默认输出日志文件为:`gpu_monitor_log.txt` +## 环境要求 ---- +- Python 3.6+ +- `mthreads-gmi` 命令可用 -## ⚙️ 参数说明 +## 安装 -| 参数 | 含义 | 示例 | -|------|--------------------|-----------------| -| `-d` | GPU 设备编号 | `-d 0` | -| `-i` | 刷新时间(单位:秒)| `-i 1` | -| `-n` | 记录次数(默认无限)| `-n 10` | +```bash +# 直接使用(无需额外依赖) +python3 mthreads_gpu_monitor.py +``` -> 如未指定 `-n`,脚本将持续运行,直到手动停止。 +## 使用方法 ---- +### 方式1:单次读取 GPU 信息 -## 🚀 使用示例 +```python +from mthreads_gpu_monitor import GPUMonitor -### ✅ 前台运行 +# 创建监控对象 +monitor = GPUMonitor() -```bash -./monitor_gpu.sh -d 0 -i 1 -n 10 +# 读取一次 GPU 信息 +monitor.update() + +# 获取所有 GPU 信息(List[Dict] 格式) +all_gpus = monitor.to_dict() +print(all_gpus) + +# 获取单张 GPU 信息 +gpu0 = monitor.get_gpu(0) +print(f"GPU 0 温度: {gpu0.temperature}°C") + +# 获取多张 GPU 信息 +gpus = monitor.get_gpu([0, 
1, 2]) +for gpu in gpus: + print(gpu) ``` -### ✅ 后台运行 +### 方式2:循环监控(后台自动刷新) + +```python +from mthreads_gpu_monitor import GPUMonitor + +# 创建监控对象(配置告警阈值和CSV日志) +monitor = GPUMonitor( + refresh_interval=5, # 刷新间隔(秒) + csv_path="gpu_metrics.csv", # CSV 日志文件路径 + alert_config={ + "temperature": 80, # 温度告警阈值(°C) + "memory_used_ratio": 0.9, # 显存占比告警阈值(90%) + }, +) + +# 启动后台监控线程 +monitor.start() + +# 主程序继续执行(监控在后台运行) +import time +time.sleep(60) + +# 停止监控 +monitor.stop() ``` -nohup ./monitor_gpu.sh -d 0 -i 1 -n 100 > /dev/null 2>&1 & -tail -f gpu_monitor_log.txt + +### 方式3:自定义告警回调 + +```python +from mthreads_gpu_monitor import GPUMonitor, GPUInfo + +def custom_alert(gpu: GPUInfo, msg: str): + """自定义告警处理函数""" + print(f"【自定义告警】{msg}") + # 可以在这里发送邮件、钉钉等 + +monitor = GPUMonitor( + refresh_interval=5, + csv_path="gpu_metrics.csv", + alert_config={ + "temperature": 80, + "memory_used_ratio": 0.9, + }, + alert_callback=custom_alert, # 传入自定义回调函数 +) + +monitor.start() ``` + +## 类和方法说明 + +### `GPUInfo` 类 + +GPU 信息的数据类,包含以下属性: + +| 属性 | 类型 | 说明 | +|------|------|------| +| `index` | int | GPU 索引号 | +| `model` | str | GPU 型号 | +| `temperature` | float | 温度(°C) | +| `power` | float | 功耗(W) | +| `utilization` | float | GPU 利用率(%) | +| `memory_total` | float | 显存总量(MiB) | +| `memory_used` | float | 显存已用(MiB) | +| `memory_used_ratio` | float | 显存占比(0.0-1.0) | + +#### 方法 + +- `to_dict()` - 返回 Dict 格式的数据 +- `__repr__()` - 返回对象的字符串表示 + +### `GPUMonitor` 类 + +GPU 监控器主类。 + +#### 初始化参数 + +```python +GPUMonitor( + refresh_interval: int = 5, # 刷新间隔(秒) + csv_path: Optional[str] = None, # CSV 日志路径 + alert_config: Optional[Dict[str, float]] = None, # 告警配置 + alert_callback: Optional[Callable[[GPUInfo, str], None]] = None # 告警回调 +) +``` + +#### 主要方法 + +| 方法 | 说明 | +|------|------| +| `update()` | 立即更新一次 GPU 信息(含告警和CSV记录) | +| `start()` | 启动后台监控线程(定时调用 update)| +| `stop()` | 停止后台监控线程 | +| `to_dict()` | 返回所有 GPU 信息的 Dict 列表 | +| `get_gpu(index)` | 按索引获取单张或多张 GPU 信息 | + +## 示例输出 + +### 方式1:单次读取 +``` +所有 
GPU 信息: +[ + { + 'index': 0, + 'model': 'MTT S4000', + 'temperature': 75.0, + 'power': 274.7, + 'utilization': 0.0, + 'memory_total': 49152.0, + 'memory_used': 516.0, + 'memory_used_ratio': 0.0105 + }, + ... +] + +第0号 GPU 的 memory_total 属性: +49152.0 +``` + +### 方式2:循环监控 +``` +GPU 监控程序已启动... +每 5 秒刷新一次,温度 ≥80°C 或显存占比 ≥90% 时告警 +CSV日志保存到: gpu_metrics.csv +按 Ctrl+C 停止监控 + +[2026-01-13 12:29:55] GPU Monitor Status: +-------------------------------------------------------------------------------- +GPU 0 (MTT S4000): + 温度: 75.0°C | 功耗: 274.7W + 显存: 516/ 49152 MiB ( 1.0%) + 利用率: 0.0% +GPU 1 (MTT S4000): + 温度: 63.0°C | 功耗: 253.9W + 显存: 516/ 49152 MiB ( 1.0%) + 利用率: 0.0% +... +``` + +## CSV 日志格式 + +自动生成的 CSV 文件包含以下列: + +```csv +timestamp,gpu_index,model,temperature,utilization,memory_used,memory_total,power +2026-01-13T12:29:55.123456,0,MTT S4000,75.0,0.0,516,49152,274.7 +2026-01-13T12:29:55.123456,1,MTT S4000,63.0,0.0,516,49152,253.9 +``` + +## 告警机制 + +### 默认告警 + +当以下条件满足时,会触发告警: + +1. **温度告警**:`temperature >= alert_config["temperature"]` +2. **显存告警**:`memory_used_ratio >= alert_config["memory_used_ratio"]` + +### 告警输出 + +``` +[ALERT] GPU 0 temperature exceeded | temp=85.5C mem_ratio=0.55 +[ALERT] GPU 2 memory exceeded | temp=70.0C mem_ratio=0.92 +``` + +### 自定义告警 + +通过 `alert_callback` 参数传入自定义函数处理告警: + +```python +def send_alert_email(gpu: GPUInfo, msg: str): + # 发送邮件 + pass + +monitor = GPUMonitor(alert_callback=send_alert_email) +``` + +## 常见问题 + +### Q: 如何在实际程序中集成此监控工具? + +A: 启动监控线程后,主程序可以继续执行其他任务,监控在后台运行: + +```python +monitor = GPUMonitor(...) +monitor.start() + +# 主程序代码 +for i in range(100): + # 处理任务... + pass + +monitor.stop() +``` + +### Q: 如何获取最新的 GPU 数据? + +A: 在循环监控模式下,访问 `monitor.gpus` 即可获取最新数据: + +```python +monitor.start() +time.sleep(10) +for gpu in monitor.gpus: + print(gpu.temperature) +``` + +### Q: 支持多进程吗? + +A: 支持。每个 GPUMonitor 实例独立运行,可创建多个实例进行监控。 + +### Q: 告警阈值可以动态修改吗? 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: wangkang

"""
GPU Monitor for MTT (mthreads-gmi)

Features:
- Periodic GPU info refresh
- CSV logging
- Threshold alerts (temperature / memory)
"""

import csv
import json
import os
import subprocess
import threading
import time
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union

# Column order of the CSV log; each data row is written in the same order.
_CSV_HEADER = [
    "timestamp",
    "gpu_index",
    "model",
    "temperature",
    "utilization",
    "memory_used",
    "memory_total",
    "power",
]


class GPUInfo:
    """Snapshot of the metrics reported for a single GPU."""

    def __init__(
        self,
        index: int,
        model: str,
        memory_total: float,
        memory_used: float,
        utilization: float,
        temperature: float,
        power: float,
    ):
        self.index = index
        self.model = model
        self.memory_total = memory_total
        self.memory_used = memory_used
        self.utilization = utilization
        self.temperature = temperature
        self.power = power

    @property
    def memory_used_ratio(self) -> float:
        """Return used/total memory, or 0.0 when the total is not positive."""
        if self.memory_total <= 0:
            return 0.0
        return self.memory_used / self.memory_total

    def to_dict(self) -> Dict[str, Any]:
        """Return all metrics, including the derived ratio, as a dict."""
        return {
            "index": self.index,
            "model": self.model,
            "memory_total": self.memory_total,
            "memory_used": self.memory_used,
            "memory_used_ratio": self.memory_used_ratio,
            "utilization": self.utilization,
            "temperature": self.temperature,
            "power": self.power,
        }

    def __repr__(self) -> str:
        return (
            f"GPUInfo(index={self.index}, model='{self.model}', "
            f"util={self.utilization}%, temp={self.temperature}C, "
            f"memory_used={self.memory_used}MiB, "
            f"memory_total={self.memory_total}MiB, "
            f"power={self.power}W)"
        )


class GPUMonitor:
    """Poll ``mthreads-gmi`` once or periodically, with alerts and CSV logging."""

    def __init__(
        self,
        refresh_interval: int = 5,
        csv_path: Optional[str] = None,
        alert_config: Optional[Dict[str, float]] = None,
        alert_callback: Optional[Callable[[GPUInfo, str], None]] = None,
    ):
        """
        refresh_interval: seconds between refreshes of the background thread
        csv_path: CSV log path (None disables CSV logging)
        alert_config: thresholds compared with ``>=``, for example
            {
                "temperature": 80,
                "memory_used_ratio": 0.9
            }
        alert_callback: called as ``callback(gpu, message)`` for each alert;
            when omitted the message is printed
        """
        self.command = ["mthreads-gmi", "-q", "--json"]
        self.refresh_interval = refresh_interval
        self.csv_path = csv_path
        self.alert_config = alert_config or {}
        self.alert_callback = alert_callback

        self.gpus: List[GPUInfo] = []

        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None

    def _extract_float(self, value: Any, unit: str = "") -> float:
        """Convert a reading such as ``"75C"`` or ``516`` to a float.

        Trailing characters found in *unit* are stripped before parsing. A
        reading that still is not numeric (for example ``"N/A"``) becomes
        0.0, the same value used for missing fields, so that one bad field
        does not discard the whole query.
        """
        if isinstance(value, (int, float)):
            return float(value)
        try:
            return float(str(value).rstrip(unit).strip())
        except ValueError:
            return 0.0

    def _parse_gpus(self, data: Dict[str, Any]) -> List[GPUInfo]:
        """Build GPUInfo objects from the decoded ``mthreads-gmi`` JSON."""
        gpus: List[GPUInfo] = []
        for gpu in data.get("GPU", []):
            memory = gpu.get("FB Memory Usage", {})
            power = gpu.get("Power Readings", {})
            # The key with a trailing space is the one this tool has always
            # read; the trimmed spelling is accepted as a fallback.
            power_draw = power.get("Power Draw ", power.get("Power Draw", "0W"))
            gpus.append(
                GPUInfo(
                    index=int(gpu.get("Index", -1)),
                    model=gpu.get("Product Name", "Unknown"),
                    memory_total=self._extract_float(memory.get("Total", 0), "MiB"),
                    memory_used=self._extract_float(memory.get("Used", 0), "MiB"),
                    utilization=self._extract_float(
                        gpu.get("Utilization", {}).get("Gpu", 0), "%"
                    ),
                    temperature=self._extract_float(
                        gpu.get("Temperature", {}).get("GPU Current Temp", "0C"), "C"
                    ),
                    power=self._extract_float(power_draw, "W"),
                )
            )
        return gpus

    def _run_command(self) -> Optional[List[GPUInfo]]:
        """Run the query command and return the parsed GPUs, or None on failure."""
        try:
            result = subprocess.run(
                self.command,
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # Covers a missing binary, a permission problem and the timeout.
            print("GPU query error:", e)
            return None

        if result.returncode != 0:
            print("mthreads-gmi failed:", result.stderr)
            return None

        try:
            return self._parse_gpus(json.loads(result.stdout))
        except (ValueError, TypeError, AttributeError, KeyError) as e:
            # Invalid JSON or a payload with an unexpected shape.
            print("GPU query error:", e)
            return None

    def update(self):
        """Refresh GPU info once, then run alert checks and CSV logging.

        When the query fails or reports no GPU, the previous snapshot is kept
        and neither alerts nor CSV rows are produced.
        """
        gpus = self._run_command()
        if gpus:
            self.gpus = gpus
            self._check_alerts()
            if self.csv_path:
                self._save_csv()

    def to_dict(self) -> List[Dict[str, Any]]:
        """Return the current snapshot of every GPU as a list of dicts."""
        return [gpu.to_dict() for gpu in self.gpus]

    def get_gpu(
        self, index: Union[int, List[int]]
    ) -> Union[GPUInfo, List[GPUInfo], None]:
        """Look up GPUs by position in the current snapshot.

        An int returns that GPUInfo, or None when out of range. A list
        returns the GPUInfo objects for the in-range positions, silently
        dropping the others. Any other argument type returns None.
        """
        if isinstance(index, int):
            return self.gpus[index] if 0 <= index < len(self.gpus) else None
        if isinstance(index, list):
            return [self.gpus[i] for i in index if 0 <= i < len(self.gpus)]
        return None

    def start(self):
        """Start the background refresh thread; no-op if already running."""
        if self._thread and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self, timeout: Optional[float] = 5.0):
        """Ask the background thread to stop and wait for it to finish.

        timeout: maximum seconds to wait for the thread (None waits forever).
        Waiting ensures no CSV row or alert is produced after this returns,
        unless a query is still in flight when the timeout expires.
        """
        self._stop_event.set()
        thread = self._thread
        # A thread cannot join itself (e.g. stop() called from an alert callback).
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout)

    def _loop(self):
        """Refresh until stopped, pausing refresh_interval seconds between runs."""
        while not self._stop_event.is_set():
            try:
                self.update()
            except Exception as e:
                # Thread boundary: report and keep monitoring rather than die
                # silently on a transient failure such as a CSV write error.
                print("GPU monitor update error:", e)
            # Event.wait returns as soon as stop() is called, unlike sleep.
            self._stop_event.wait(self.refresh_interval)

    def _save_csv(self):
        """Append one row per GPU to the CSV log, writing the header first
        when the file is missing or empty."""
        try:
            needs_header = os.path.getsize(self.csv_path) == 0
        except OSError:
            needs_header = True

        with open(self.csv_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            if needs_header:
                writer.writerow(_CSV_HEADER)

            ts = datetime.now().isoformat()
            writer.writerows(
                [
                    ts,
                    gpu.index,
                    gpu.model,
                    gpu.temperature,
                    gpu.utilization,
                    gpu.memory_used,
                    gpu.memory_total,
                    gpu.power,
                ]
                for gpu in self.gpus
            )

    def _check_alerts(self):
        """Raise an alert for every GPU at or above a configured threshold."""
        temperature_limit = self.alert_config.get("temperature")
        memory_limit = self.alert_config.get("memory_used_ratio")
        for gpu in self.gpus:
            if temperature_limit is not None and gpu.temperature >= temperature_limit:
                self._alert(gpu, "temperature")

            if memory_limit is not None and gpu.memory_used_ratio >= memory_limit:
                self._alert(gpu, "memory")

    def _alert(self, gpu: GPUInfo, alert_type: str):
        """Format the alert message and hand it to the callback, or print it."""
        msg = (
            f"[ALERT] GPU {gpu.index} {alert_type} exceeded | "
            f"temp={gpu.temperature}C "
            f"mem_ratio={gpu.memory_used_ratio:.2f}"
        )
        if self.alert_callback:
            self.alert_callback(gpu, msg)
        else:
            print(msg)


def _demo() -> None:
    """Show both usage modes: a single read, then background monitoring."""
    # Mode 1: read GPU info once.
    monitor = GPUMonitor()
    monitor.update()

    # Print every GPU as a list of dicts.
    print("=== 方式1: 只读取一次 GPU 信息 ===")
    print("所有 GPU 信息:")
    print(monitor.to_dict(), "\n")

    # The query may have found no GPU, so guard the per-GPU output.
    first = monitor.get_gpu(0)
    if first is not None:
        print("第0号 GPU 信息:")
        print(first.to_dict(), "\n")

        print("第0号 GPU 的 memory_total 属性:")
        print(first.memory_total, "\n")

    # Several GPUs at once.
    print(monitor.get_gpu([0, 1]), "\n")

    # Mode 2: background monitoring, refreshed every 5 seconds.
    print("\n=== 方式2: 循环监控 ===")
    monitor = GPUMonitor(
        refresh_interval=5,
        csv_path="gpu_metrics.csv",
        alert_config={
            "temperature": 80,
            "memory_used_ratio": 0.9,
        },
    )
    print("GPU 监控程序已启动...")
    print("每 5 秒刷新一次,温度 ≥80°C 或显存占比 ≥90% 时告警")
    print("CSV日志保存到: gpu_metrics.csv")
    print("按 Ctrl+C 停止监控\n")

    monitor.start()
    try:
        time.sleep(30)  # placeholder for the workload you want to monitor
    except KeyboardInterrupt:
        pass
    finally:
        monitor.stop()


if __name__ == "__main__":
    _demo()