From 362c0888422cf6f8d1487e439d8c9364cb703f8c Mon Sep 17 00:00:00 2001
From: Wang Kang <kang.wang-EXT@mthreads.com>
Date: Fri, 5 Dec 2025 15:53:40 +0800
Subject: [PATCH 1/4] add matmul scripts

---
 base_test/matmul_test/READE.md                |  57 ++
 .../summarize_f32_f16_bf16_q8_fp8_log.py      | 122 ++++
 .../summarize_fp64_tf32_log.py                | 120 ++++
 .../exetrct_log_tools/summary_mixed_data.py   |  64 ++
 .../fp64_tf32_src/build_gemm_fp64.sh          |   1 +
 .../fp64_tf32_src/build_gemm_tf32.sh          |   1 +
 .../matmul_test/fp64_tf32_src/gemm_fp64.mu    | 122 ++++
 .../matmul_test/fp64_tf32_src/gemm_tf32.cpp   | 678 ++++++++++++++++++
 .../test_gemm_f32_f16_bf16_q8_fp8.sh          |  48 ++
 base_test/matmul_test/test_gemm_fp64_tf32.sh  |  84 +++
 base_test/matmul_test/test_gemm_mixed.sh      |  86 +++
 {script => base_test}/monitor/README.md       |   0
 {script => base_test}/monitor/monitor_gpu.sh  |   0
 13 files changed, 1383 insertions(+)
 create mode 100644 base_test/matmul_test/READE.md
 create mode 100644 base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
 create mode 100644 base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
 create mode 100644 base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
 create mode 100644 base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
 create mode 100644 base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
 create mode 100644 base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
 create mode 100644 base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
 create mode 100644 base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh
 create mode 100644 base_test/matmul_test/test_gemm_fp64_tf32.sh
 create mode 100644 base_test/matmul_test/test_gemm_mixed.sh
 rename {script => base_test}/monitor/README.md (100%)
 rename {script => base_test}/monitor/monitor_gpu.sh (100%)
 mode change 100755 => 100644

diff --git a/base_test/matmul_test/READE.md b/base_test/matmul_test/READE.md
new file mode 100644
index 0000000..c54a072
--- /dev/null
+++ b/base_test/matmul_test/READE.md
@@ -0,0 +1,57 @@
+Matmul 自动化测试脚本
+# 1. 脚本说明
+matmul 存放位置：
+```shell
+mudnn_bench
+├── bench_test_matmul.sh
+├── bin
+│   ├── mudnn_bench -> mudnn_bench-x.x.x
+│   └── mudnn_bench-x.x.x
+├── matmul_test
+```
+mudnn_bench 示例：
+```shell
+
+# 示例 1：单卡，大矩阵，f32
+MUSA_VISIBLE_DEVICES=4 ./bin/mudnn_bench -m --mm_m 6144 --mm_n 3584 --mm_k 6144 --warmup 30 --tm i --tmv 1000 -p -t f32
+
+# 示例 2：多卡，标准尺寸，bf16
+MUSA_VISIBLE_DEVICES=0,1 ./bin/mudnn_bench -m --mm_m 4096 --mm_n 4096 --mm_k 4096 --warmup 30 --tm i --tmv 1000 -p -t bf16
+
+# 示例 3：单卡，特殊组合，int8
+MUSA_VISIBLE_DEVICES=2 ./bin/mudnn_bench -m --mm_m 8192 --mm_n 8192 --mm_k 768 --warmup 30 --tm i --tmv 1000 -p -t int8
+
+# 示例 4：使用混合精度格式
+MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048 --warmup 30 --tm i --tmv 1000 -p -t bf16:q4:bf16:bf16
+```
+
+# 2. 测试
+## 2.1 fp64, tf32 测试
+```shell
+# 1. 编译
+bash ./fp64_tf32_src/build_gemm_tf32.sh
+
+bash ./fp64_tf32_src/build_gemm_fp64.sh
+
+# 2. 测试
+bash test_gemm_fp64_tf32.sh
+```
+
+## 2.2 f32_f16_bf16_q8_fp8 测试
+```shell
+bash test_gemm_f32_f16_bf16_q8_fp8.sh
+```
+
+## 2.3 混合精度测试
+```shell
+# A,B: fp16, C,D: f32: "f16:f16:f32:f32"
+# A,B: bf16, C,D: f32: "bf16:bf16:f32:f32"
+# A,B: tf32, C,D: f32: "f32"
+# A,B: int8, C,D: int32: "int8"
+# W8A8: "q8:q8:f32:f32"
+# W4A16: "bf16:q4:bf16:bf16"
+# A,B: fp8, C,D: fp16: "float8_e4m3:float8_e4m3:f16:f16"
+
+bash test_gemm_mixed.sh
+```
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
new file mode 100644
index 0000000..95a2a28
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
@@ -0,0 +1,122 @@
+import re
+import os
+import sys
+from typing import List, Dict, Optional
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    patterns = {
+        "datatype": re.compile(r"DataType (\w+)"),
+        "mat_params": re.compile(r"m (\d+), n (\d+), k (\d+)"),
+        "elapsed_time": re.compile(r"AverageElapsedTime\(ms\) : (\d+\.\d+)"),
+        "throughput_gops": re.compile(r"Throughput (\d+\.\d+) GOPS")
+    }
+
+    extracted = []
+    current_block = {}
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+
+                dt_match = patterns["datatype"].search(line)
+                if dt_match:
+                    current_block["datatype"] = dt_match.group(1)
+
+                mp_match = patterns["mat_params"].search(line)
+                if mp_match:
+                    current_block["m"] = mp_match.group(1)
+                    current_block["n"] = mp_match.group(2)
+                    current_block["k"] = mp_match.group(3)
+
+                et_match = patterns["elapsed_time"].search(line)
+                if et_match:
+                    current_block["elapsed_time"] = et_match.group(1)
+
+                tp_match = patterns["throughput_gops"].search(line)
+                if tp_match:
+                    tops = round(float(tp_match.group(1)) / 1000, 4)
+                    current_block["throughput_tops"] = str(tops)
+
+                if line == "==============================" and current_block:
+                    required = ["datatype", "m", "n", "k", "elapsed_time", "throughput_tops"]
+                    if all(key in current_block for key in required):
+                        dim = f"{current_block['m']}-{current_block['n']}-{current_block['k']}"
+                        extracted.append({
+                            "datatype": current_block["datatype"],
+                            "shape": dim,
+                            "Throughput(TOPS)": current_block["throughput_tops"],
+                            "AverageElapsedTime(ms)": current_block["elapsed_time"]
+                        })
+                    current_block = {}
+
+        required = ["datatype", "m", "n", "k", "elapsed_time", "throughput_tops"]
+        if current_block and all(key in current_block for key in required):
+            dim = f"{current_block['m']}×{current_block['n']}×{current_block['k']}"
+            extracted.append({
+                "datatype": current_block["datatype"],
+                "shape": dim,
+                "Throughput(TOPS)": current_block["throughput_tops"],
+                "AverageElapsedTime(ms)": current_block["elapsed_time"]
+            })
+
+    except Exception as e:
+        print(f"❌ 读取日志失败：{str(e)}")
+        return []
+
+    return extracted
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    if not data:
+        print("⚠️  未提取到有效数据，跳过CSV生成")
+        return False
+
+    headers = ["datatype", "shape", "Throughput(TOPS)", "AverageElapsedTime(ms)"]
+
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(", ".join(headers) + "\n")
+            for item in data:
+                row = [item[h] for h in headers]
+                f.write(", ".join(row) + "\n")
+        print(f"✅ CSV生成成功：{output_path}")
+        return True
+    except Exception as e:
+        print(f"❌ 生成CSV失败：{str(e)}")
+        return False
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在：{input_log}")
+        return
+
+    if not output_csv:
+        log_dir = os.path.dirname(input_log)
+        log_name = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(log_dir, f"{log_name}_summary.csv")
+
+    print(f"📊 开始提取日志数据：{input_log}")
+    matmul_data = extract_matmul_data(input_log)
+
+    if not matmul_data:
+        print("❌ 未提取到任何有效测试数据")
+        return
+
+    print(f"✅ 成功提取 {len(matmul_data)} 条测试记录")
+
+    generate_csv(matmul_data, output_csv)
+    print("🎯 所有操作完成！")
+
+if __name__ == "__main__":
+    # 修正sys.argv判断（sys.argv[0]是脚本名，需至少传入1个输入文件路径）
+    if len(sys.argv) < 2:
+        print("用法：")
+        print("  python summarize_fp64_tf32_log.py <输入日志文件路径>")
+        print("示例：")
+        print("  python summarize_fp64_tf32_log.py bench.log")
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    output_path = sys.argv[1].replace('.log', '.csv')  # 日志文件同名CSV输出
+    main(input_path, output_path)
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
new file mode 100644
index 0000000..31dacbd
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
@@ -0,0 +1,120 @@
+import re
+import sys
+import os
+from typing import List, Dict, Optional
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    patterns = {
+        "datatype": re.compile(r"MatMul (\w+) Test \(MUSA\)"),
+        "mat_params": re.compile(r"m = (\d+), n = (\d+), k = (\d+)"),
+        "duration_us": re.compile(r"Duration:(\s*[\d\.]+) us"),
+        "tflops": re.compile(r"computation-\w+=(\s*[\d\.]+)")
+    }
+
+    extracted = []
+    current_block = {}
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+
+                dt_match = patterns["datatype"].search(line)
+                if dt_match:
+                    current_block["datatype"] = dt_match.group(1)
+
+                mp_match = patterns["mat_params"].search(line)
+                if mp_match:
+                    current_block["m"] = mp_match.group(1)
+                    current_block["n"] = mp_match.group(2)
+                    current_block["k"] = mp_match.group(3)
+
+                dur_match = patterns["duration_us"].search(line)
+                if dur_match:
+                    us_val = float(dur_match.group(1).strip())
+                    ms_val = round(us_val / 1000, 6)
+                    current_block["duration_ms"] = str(ms_val)
+
+                tf_match = patterns["tflops"].search(line)
+                if tf_match:
+                    tf_val = tf_match.group(1).strip()
+                    current_block["tflops"] = str(round(float(tf_val), 6))
+
+                if line == "========================================" and current_block:
+                    required = ["datatype", "m", "n", "k", "duration_ms", "tflops"]
+                    if all(key in current_block for key in required):
+                        shape = f"{current_block['m']}-{current_block['n']}-{current_block['k']}"
+                        extracted.append({
+                            "DataType": current_block["datatype"],
+                            "shape": shape,
+                            "Compute_ability(TFLOPS)": current_block["tflops"],
+                            "AverageElapsedTime(ms)": current_block["duration_ms"]
+                        })
+                    current_block = {}
+
+        required = ["datatype", "m", "n", "k", "duration_ms", "tflops"]
+        if current_block and all(key in current_block for key in required):
+            shape = f"{current_block['m']}-{current_block['n']}-{current_block['k']}"
+            extracted.append({
+                "DataType": current_block["datatype"],
+                "shape": shape,
+                "Compute_ability(TFLOPS)": current_block["tflops"],
+                "AverageElapsedTime(ms)": current_block["duration_ms"]
+            })
+
+    except Exception as e:
+        print(f"❌ 读取日志失败：{str(e)}")
+        return []
+
+    return extracted
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    if not data:
+        print("⚠️  未提取到有效数据，跳过CSV生成")
+        return False
+
+    headers = ["DataType", "shape", "Compute_ability(TFLOPS)", "AverageElapsedTime(ms)"]
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(", ".join(headers) + "\n")
+            for item in data:
+                row = [item[h] for h in headers]
+                f.write(",".join(row) + "\n")
+        print(f"✅ CSV生成成功：{output_path}")
+        return True
+    except Exception as e:
+        print(f"❌ 生成CSV失败：{str(e)}")
+        return False
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在：{input_log}")
+        return
+
+    if not output_csv:
+        log_dir = os.path.dirname(input_log)
+        log_name = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(log_dir, f"{log_name}_summary.csv")
+
+    print(f"📊 开始提取日志数据：{input_log}")
+    matmul_data = extract_matmul_data(input_log)
+
+    if not matmul_data:
+        print("❌ 未提取到任何有效测试数据")
+        return
+
+    print(f"✅ 成功提取 {len(matmul_data)} 条测试记录")
+    generate_csv(matmul_data, output_csv)
+    print("🎯 所有操作完成！")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("用法：")
+        print("  python summarize_fp64_tf32_log.py <输入日志文件路径>")
+        print("示例：")
+        print("  python summarize_fp64_tf32_log.py bench.log")
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    output_path = sys.argv[1].replace('.log', '.csv')
+    main(input_path, output_path)
diff --git a/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
new file mode 100644
index 0000000..5edfa15
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
@@ -0,0 +1,64 @@
+import re
+import sys
+import csv
+import os
+
+if len(sys.argv) < 2:
+    print("Usage: python summary_fix_data.py <log_file>")
+    sys.exit(1)
+
+log_file = sys.argv[1]
+print(f"📊 正在读取并解析日志：{log_file}")
+
+if not os.path.exists(log_file):
+    print("❌ 日志文件不存在")
+    sys.exit(1)
+
+# 收集结果
+records = []
+
+# 正则模式
+re_start = re.compile(r"测试:\s*M=(\d+),\s*N=(\d+),\s*K=(\d+),\s*Type=([\w:]+)")
+re_result = re.compile(r"AverageElapsedTime\(ms\)\s*:\s*([\d\.]+)\s*,\s*Throughput\s*([\d\.]+)\s*GOPS")
+
+cur_M = cur_N = cur_K = cur_type = None
+
+with open(log_file, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+
+        # 匹配开始参数
+        m1 = re_start.search(line)
+        if m1:
+            cur_M, cur_N, cur_K, cur_type = m1.groups()
+            continue
+
+        # 匹配结果
+        m2 = re_result.search(line)
+        if m2 and cur_M is not None:
+            elapsed, gops = m2.groups()
+            records.append({
+                "M": cur_M,
+                "N": cur_N,
+                "K": cur_K,
+                "Type": cur_type,
+                "AvgTime(ms)": elapsed,
+                "GOPS": gops
+            })
+            # 清空当前块（防止串行）
+            cur_M = cur_N = cur_K = cur_type = None
+
+# 输出 CSV
+if not records:
+    print("⚠️ 未提取到任何有效数据")
+    sys.exit(0)
+
+csv_path = log_file.replace(".log", ".csv")
+with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.DictWriter(csvfile, fieldnames=records[0].keys())
+    writer.writeheader()
+    writer.writerows(records)
+
+print(f"✅ 解析完成，共 {len(records)} 条数据")
+print(f"📄 CSV 已生成：{csv_path}")
+
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
new file mode 100644
index 0000000..4d33fd1
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
@@ -0,0 +1 @@
+cd "$(dirname "$0")" && mcc gemm_fp64.mu -lmusart -lmublas -o gemm_fp64 --offload-arch=mp_31  # build beside the source whatever the caller's cwd is
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
new file mode 100644
index 0000000..83b5acb
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
@@ -0,0 +1 @@
+cd "$(dirname "$0")" && g++ gemm_tf32.cpp -std=c++17 -I/usr/local/musa/include -L /usr/local/musa/lib/ -fopenmp -lmudnn -lmusart -o gemm_tf32 -O2  # build beside the source whatever the caller's cwd is
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
new file mode 100644
index 0000000..ac62c9e
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
@@ -0,0 +1,122 @@
+#include <chrono>
+#include <iostream>
+#include <mublas.h>
+#include <musa_runtime.h>
+#include <vector>
+
+size_t M = 16384;  // element count factor of A (M*K) and C (M*N); replaced from argv in main()
+size_t N = 16384;  // element count factor of B (K*N) and C (M*N); replaced from argv in main()
+size_t K = 16384;  // element count factor of A (M*K) and B (K*N); replaced from argv in main()
+
+struct PrecisionConfig  // per-run settings consumed by test()
+{
+  int bytesPerElement;         // multiplied with the element counts for musaMalloc/musaMemcpy sizes
+  const char *name;            // label printed in the "Average <name> Single Op Duration" line
+  int NUM_ITERATIONS;          // number of timed mublasDgemm calls
+  int WARMUP_ITERATIONS = 10;  // untimed mublasDgemm calls issued before the timer starts
+};
+
+void test(const PrecisionConfig &config)
+{
+  // Times config.NUM_ITERATIONS mublasDgemm calls (after WARMUP_ITERATIONS
+  // untimed ones) and prints the average duration and the achieved TFLOPS.
+  // A failing MUSA runtime call ends the process, so no throughput is
+  // reported for a run that did not actually execute on the device.
+  auto check = [](musaError_t err, const char *what) {
+    if (err != musaSuccess)
+    {
+      std::cerr << "MUSA error in " << what << ": "
+                << musaGetErrorString(err) << std::endl;
+      std::exit(EXIT_FAILURE);
+    }
+  };
+
+  double *d_A, *d_B, *d_C;
+  std::vector<double> h_A(M * K, double(0.9f));
+  std::vector<double> h_B(K * N, double(1.2f));
+  std::vector<double> h_C(M * N);
+
+  check(musaMalloc(&d_A, M * K * config.bytesPerElement), "musaMalloc(A)");
+  check(musaMalloc(&d_B, K * N * config.bytesPerElement), "musaMalloc(B)");
+  check(musaMalloc(&d_C, M * N * config.bytesPerElement), "musaMalloc(C)");
+
+  check(musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice), "musaMemcpy(A)");
+  check(musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice), "musaMemcpy(B)");
+
+  mublasHandle_t handle;
+  mublasCreate(&handle);
+
+  double alpha = 1.0f;
+  double beta = 0.0f;
+
+  // Shared by the warm-up and timed phases so both issue the identical call.
+  auto run_gemm = [&](int count) {
+    for (int i = 0; i < count; ++i)
+    {
+      mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_T,
+                  M, N, K, &alpha,
+                  d_A, M,
+                  d_B, N,
+                  &beta,
+                  d_C, M);
+    }
+  };
+
+  run_gemm(config.WARMUP_ITERATIONS);
+  check(musaDeviceSynchronize(), "warm-up synchronize");
+
+  auto start = std::chrono::high_resolution_clock::now();
+  run_gemm(config.NUM_ITERATIONS);
+  check(musaDeviceSynchronize(), "timed synchronize");
+  auto end = std::chrono::high_resolution_clock::now();
+
+  auto duration =
+      std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  std::cout << "Average " << config.name << " Single Op Duration: "
+            << duration.count() / config.NUM_ITERATIONS << " us" << std::endl;
+
+  double time_second = duration.count() / 1.0e6;
+  double flops = 2.0 * M * N * K * config.NUM_ITERATIONS;
+  double FLOPS = flops / time_second;
+  double TFLOPS = FLOPS / 1.0e12;
+
+  std::cout << "[FlagPerf Result]" << "computation-FP64=" << TFLOPS << "TFLOPS"
+            << std::endl;
+
+  check(musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost), "musaMemcpy(C)");
+
+  musaFree(d_A);
+  musaFree(d_B);
+  musaFree(d_C);
+
+  mublasDestroy(handle);
+}
+
+int main(int argc, char* argv[]) {
+  // Usage: <m> <n> <k> <iter>; runs one FP64 GEMM benchmark on device 0.
+  if (argc != 5) {
+      std::cerr << "Usage: " << argv[0] << " <m> <n> <k> <iter>" << std::endl;
+      std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl;
+      return EXIT_FAILURE;
+  }
+
+  int m = std::atoi(argv[1]);
+  int n = std::atoi(argv[2]);
+  int k = std::atoi(argv[3]);
+  int iter = std::atoi(argv[4]);
+  // atoi gives 0 for bad text; sizes <= 0 would wrap in size_t, iter 0 divides by 0.
+  if (m <= 0 || n <= 0 || k <= 0 || iter <= 0) {
+      std::cerr << "m, n, k and iter must be positive integers" << std::endl;
+      return EXIT_FAILURE;
+  }
+
+  std::cout << "========================================" << std::endl;
+  std::cout << "MatMul FP64 Test (MUSA)" << std::endl;
+  std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+  std::cout << "Test Iterations = " << iter << std::endl;
+  M = m; N = n; K = k;
+  musaSetDevice(0);
+  PrecisionConfig fp64_PrecisionConfig = {sizeof(double), "FP64", iter, 40};
+  test(fp64_PrecisionConfig);
+  return 0;
+}
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
new file mode 100644
index 0000000..6221eed
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
@@ -0,0 +1,678 @@
+/* Copyright @2020-2024 Moore Threads Technology Co., Ltd("Moore Threads"). All
+ * rights reserved.
+ *
+ * This software ("this software and its documentations" or "the software") is
+ * protected by Copyright and the information contained herein is confidential.
+ *
+ * The software contained herein is PROPRIETARY to Moore Threads and is being
+ * provided under the terms and conditions of a form of Moore Threads software
+ * license agreement by and between Moore Threads and Licensee ("License
+ * Agreement") or electronically accepted by Licensee. Notwithstanding any
+ * terms or conditions to the contrary in the License Agreement, copy or
+ * disclosure of the software to any third party without the express written
+ * consent of Moore Threads is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE
+ * AGREEMENT, MOORE THREADS MAKES NO REPRESENTATION ABOUT ANY WARRANTIES,
+ * INCLUDING BUT NOT LIMITED TO THE SUITABILITY OF THE SOFTWARE FOR ANY
+ * PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
+ * ANY KIND. MOORE THREADS DISCLAIMS ALL WARRANTIES WITH REGARD TO THE
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL MOORE THREADS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THE SOFTWARE.
+ */
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include <chrono>
+#include <map>
+#include <thread>
+#include <type_traits>
+#include <typeinfo>
+
+#include <iostream>
+#include <mudnn.h>
+#include <cstring>
+#include <random>
+
+#include <eigen3/Eigen/Core>
+
+using qint8 = int8_t;
+
+#define SHOW printf
+
+namespace Eigen {
+    struct half;
+    struct bfloat16;
+}
+using Eigen::bfloat16;
+using Eigen::half;
+
+
+struct MatMulParam {  // settings copied member by member in TestMatMul's constructor
+    bool split_k{ false };
+    bool trans_a{ false };
+    bool trans_b{ true };
+    int batch{ 1 };
+    int m{ 6144 };   // m, n, k: Test() counts 2 * m * n * k operations per call
+    int n{ 8192 };
+    int k{ 19200 };
+    double alpha{ 1.0 };
+    double beta{ 0.0 };   // CheckParams() resets a non-zero value to 0 when mode == 0
+    double gamma{ 0.0 };  // CheckParams() resets any non-zero value to 0
+    int mode{ 0 }; // 0 tensor, 1 scalar; CheckParams() falls back to 0 for other values
+};
+
+#define CHECK_MUSA(...)                                                        \
+  do {                                                                         \
+    int err = CheckMusaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);   \
+    if (err)                                                                   \
+      exit(err);                                                               \
+  } while (0)
+
+#define CHECK_ERR(...)                                                         \
+  do {                                                                         \
+    int err = CheckError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);       \
+    if (err)                                                                   \
+      exit(err);                                                               \
+  } while (0)
+
+int CheckMusaError(musaError_t code, const char* expr, const char* file,
+    int line) {
+    // Returns 0 for a zero (success) code; otherwise prints the failing
+    // expression with its location and returns 1 so CHECK_MUSA can exit.
+    if (!code) return 0;
+    printf("MUSA error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code,
+        musaGetErrorString(code), expr);
+    return 1;
+}
+
+int CheckError(bool code, const char* expr, const char* file, int line) {
+    // Returns 0 when `code` is false; otherwise prints the failing expression
+    // with its location and returns 1 so CHECK_ERR can exit.
+    if (!code) return 0;
+    printf("General error at %s:%d, code=%d (%s) in '%s'\n", file, line,
+        (int)code, "general error", expr);
+    return 1;
+}
+
+template <typename Type, typename RandomType>
+void GenerateRandom(Type* data, int64_t size, uint seed = 2333) {
+    // Fills data[0, size) from a fixed-seed engine; both ranges are [0, 0], so
+    // every element is zero. RandomType only picks the real or integer branch.
+    std::default_random_engine engine(seed);
+    if constexpr (std::is_floating_point_v<RandomType>) {
+        std::uniform_real_distribution<float> dist(0, 0);
+        for (int64_t i = 0; i < size; i++) {
+            data[i] = (Type)(dist(engine));
+        }
+    }
+    else {
+        // uniform_int_distribution is undefined for character types such as
+        // int8_t, so draw as int and narrow on assignment.
+        std::uniform_int_distribution<int> dist(0, 0);
+        for (int64_t i = 0; i < size; i++) {
+            data[i] = (Type)(dist(engine));
+        }
+    }
+}
+
+void MemFree(void* ptr) {
+    // Deleter given to MemoryHandler in MemoryFunc; null pointers are ignored.
+    if (ptr == nullptr) return;
+    musaFree(ptr);
+}
+
+::musa::dnn::MemoryHandler MemoryFunc(size_t size) {
+    void* data = nullptr;  // stays null for a zero-byte request
+    if (size) {
+        CHECK_MUSA(musaMalloc(&data, size));   // exit instead of handing out a bad buffer
+        CHECK_MUSA(musaMemset(data, 0, size));
+    }
+    return ::musa::dnn::MemoryHandler(data, MemFree);
+}
+
+enum DType {  // element types that TestMatMul's constructor maps to a name and size
+    f32,   // "float32", 4 bytes
+    f16,   // "float16", 2 bytes
+    q8,    // "qint8", 1 byte
+    bf16,  // "bfloat16", 2 bytes
+};
+
+
+class TestMatMul {
+public:
+    inline float F32MaskFormatTF32(float f) {  // returns f with the low 13 bits of its bit pattern cleared
+        unsigned int t = 0;  // receives the raw bit pattern of f
+        std::memcpy(&t, &f, sizeof(f));
+        // 0xffffe000 ends in binary 1110 0000 0000 0000, i.e. it keeps the upper 19 bits
+        t = t & 0xffffe000;
+        std::memcpy(&f, &t, sizeof(f));
+        return f;
+    }
+
+    // Random num generator
+
+
+    TestMatMul(const musaStream_t& _stream, const int _device_id, const DType _dtype, const MatMulParam _param, const int _iters)
+    {  // Stores the run configuration and creates the muDNN handle bound to _stream.
+        stream = _stream;
+        device_id = _device_id;
+        dtype = _dtype;
+        dtype_size = 4;  // provisional value; every valid case below assigns it again
+
+        switch (dtype) {  // derive the dtype name and element size in bytes
+            case DType::f32:
+                dtype_str = "float32";
+                dtype_size = 4;
+                break;
+            case DType::f16:
+                dtype_str = "float16";
+                dtype_size = 2;
+                break;
+            case DType::bf16:
+                dtype_str = "bfloat16";
+                dtype_size = 2;
+                break;
+            case DType::q8:
+                dtype_str = "qint8";
+                dtype_size = 1;
+                break;
+            default:  // value outside the enum: CHECK_ERR prints the flag name and exits
+                bool DType_Not_Suppoted = true;
+                CHECK_ERR(DType_Not_Suppoted);
+                break;
+        }
+        split_k = _param.split_k;  // plain copies of the MatMulParam fields
+        trans_a = _param.trans_a;
+        trans_b = _param.trans_b;
+        batch = _param.batch;
+        m = _param.m;
+        n = _param.n;
+        k = _param.k;
+        alpha = _param.alpha;
+        beta = _param.beta;
+        gamma = _param.gamma;
+        mode = _param.mode;
+
+        iters = _iters;  // Test() loops on wall-clock duration instead when this is 0
+
+        handle = new ::musa::dnn::Handle(device_id);  // released in the destructor
+        handle->SetStream(stream);
+    };
+    ~TestMatMul() {  // Releases every non-null host/device buffer, then the muDNN handle.
+#define FREE_H(_PTR)                                                           \
+  if (_PTR != nullptr) {                                                       \
+    operator delete(_PTR);                                                     \
+  }
+#define FREE_D(_PTR)                                                           \
+  if (_PTR != nullptr) {                                                       \
+    CHECK_MUSA(musaFree(_PTR));                                                \
+  }
+        // Host buffers go through operator delete (FREE_H).
+        FREE_H(h_buf_a);
+        FREE_H(h_buf_b);
+        FREE_H(h_buf_c);
+        FREE_H(h_buf_o);
+        FREE_H(h_buf_z);
+        // Device buffers go through musaFree; a failure exits via CHECK_MUSA.
+        FREE_D(d_a);
+        FREE_D(d_b);
+        FREE_D(d_c);
+        FREE_D(d_z);
+
+        FREE_D(d_base);
+        FREE_D(d_bool);
+        FREE_D(d_nonz);
+        FREE_H(h_nonz);
+        // The helper macros are local to this destructor.
+#undef FREE_H
+#undef FREE_D
+
+        if (handle) {
+            delete handle;
+        }
+    };
+
+    bool Test() {  // Warm-up, then the measured loop; prints throughput when `performance` is set. Always returns true.
+        // check parameters
+        CheckParams();
+        // initial memory && dnn tensor op
+        Init();
+        // warm up && prepare base golden
+        int warmup_iters = 40;
+        for (int i = 0; i < warmup_iters; i++) {
+            Exec();
+        }
+        // main loop
+        float elapsed_ms = 0.f;
+        musaEvent_t start, stop;  // only created and used when `performance` is set
+        if (performance) {
+            CHECK_MUSA(musaEventCreate(&start));
+            CHECK_MUSA(musaEventCreate(&stop));
+            CHECK_MUSA(musaEventRecord(start, stream));
+        }
+
+        std::chrono::milliseconds bubble_time(bubble);
+        std::chrono::milliseconds duration_time(duration);
+        std::chrono::milliseconds show_gap_time(60000);  // progress message interval: 60 s
+        int show_gap_count = 0;
+        auto start_time = std::chrono::steady_clock::now();
+        auto current_time = start_time;
+        const bool blocking = (bubble > 0) || (iters == 0 && duration > 0);  // forwarded to Exec()
+        int stable_check_gap_count = 1;  // incremented below but never read in this method
+        int run_iters_count = 0;         // number of Exec() calls made by the loop
+        int i = 0;
+        while ((iters > 0 && i < iters) ||  // fixed iteration count
+            (iters == 0 && (current_time - start_time) <= duration_time)) {  // or wall-clock budget
+            // operator running
+            Exec(blocking);
+
+            if (bubble > 0) {
+                // SHOW("sleeping %d ms\n", bubble);
+                std::this_thread::sleep_for(bubble_time);
+            }
+            current_time = std::chrono::steady_clock::now();
+            if ((iters == 0 && duration > 0) &&
+                (current_time - start_time) > show_gap_time * show_gap_count) {
+                std::cout << "--- now execution time passed "
+                    << (show_gap_time * show_gap_count).count() << std::endl;
+                show_gap_count++;
+            }
+            // SHOW("run loop %d\n", run_iters_count);
+            i++, stable_check_gap_count++, run_iters_count++;
+        }
+        // performance testing and stability checking are mutually exclusive
+        if (performance) {
+            CHECK_MUSA(musaEventRecord(stop, stream));
+            CHECK_MUSA(musaEventSynchronize(stop));
+            CHECK_MUSA(musaEventElapsedTime(&elapsed_ms, start, stop));
+            elapsed_ms = elapsed_ms / run_iters_count;  // average ms per Exec() call
+            ShowPerformance(elapsed_ms, (size_t)m * n * k * 2 / elapsed_ms * 1e-6,  // ops per ms * 1e-6 = GOPS
+                !stable_check);
+            CHECK_MUSA(musaEventDestroy(start));
+            CHECK_MUSA(musaEventDestroy(stop));
+        }
+        return true;
+    }
+
+    void ShowPerformance(float t, float gops, bool credible) {
+        // t: average ms per op (printed in us); gops is printed as TFLOPS.
+        // The exact text of both lines is what the log summarizer matches.
+        (void)credible;  // kept in the signature, not used for the output
+        const double duration_us = t * 1.0e3;
+        const double tflops = gops / 1.0e3;
+        SHOW("Average TF32 Single Op Duration:%f us\n", duration_us);
+        SHOW("[FlagPerf Result]computation-TF32=%f TFLOPS\n", tflops);
+    }
+
+private:
+    void* h_buf_a = nullptr;
+    void* h_buf_b = nullptr;
+    void* h_buf_c = nullptr;
+    void* h_buf_o = nullptr;
+    void* h_buf_z = nullptr;
+
+    void* d_a = nullptr;
+    void* d_b = nullptr;
+    void* d_c = nullptr;
+    void* d_z = nullptr;
+
+    void* d_base = nullptr;
+    void* d_bool = nullptr;
+    void* d_nonz = nullptr;
+    int64_t* h_nonz = nullptr;
+
+    bool result_check = false;
+    bool stable_check = false;
+    bool stable_check_gpu = false;
+    bool performance = true;
+    bool verbose = false;
+    int iters = 1;
+    int duration = 0;
+    int bubble = 0;
+    int gap = 1;
+    uint seed = 2333;
+
+    DType dtype = DType::f32;
+    std::string dtype_str = "float32";
+    size_t dtype_size = 4;
+    bool split_k = false;
+    bool trans_a = false;
+    bool trans_b = false;
+    int batch = 1;
+    int m = 1;
+    int n = 1;
+    int k = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+    double gamma = 0.0;
+    int mode = 0;
+
+    // qint8 variables
+    const float scale_a = 1.f / 32.f;
+    const float scale_b = 1.f / 32.f;
+    const float scale_c = 32.f;
+
+    // mudnn variables
+    musaStream_t stream;
+    int device_id;
+    ::musa::dnn::Handle* handle;
+    ::musa::dnn::MatMul op;
+
+    ::musa::dnn::Tensor tensor_a;
+    ::musa::dnn::Tensor tensor_b;
+    ::musa::dnn::Tensor tensor_c;
+    ::musa::dnn::Tensor tensor_z;
+    ::musa::dnn::Tensor tensor_base;
+    ::musa::dnn::Tensor tensor_bool;
+    ::musa::dnn::Tensor tensor_nonz;
+
+private:
+
+
+    ::musa::dnn::Tensor::Type GetmuDNNType(const std::string& dtype) {
+        // Name -> muDNN tensor type; unknown names are reported and map to "float".
+        using T = ::musa::dnn::Tensor::Type;
+        static std::map<std::string, T> type_mapping = {
+            {"int8", T::INT8},
+            {"int16", T::INT16},
+            {"int32", T::INT32},
+
+            {"int", T::INT64},
+            {"int64", T::INT64},
+
+            {"uint8", T::UINT8},
+            {"uint16", T::UINT16},
+            {"uint32", T::UINT32},
+
+            {"uint", T::UINT64},
+            {"uint64", T::UINT64},
+
+            {"half", T::HALF},
+            {"float16", T::HALF},
+            {"bfloat16", T::BFLOAT16},
+
+            {"float32", T::FLOAT},
+            {"qint8", T::QINT8},
+
+            {"float", T::FLOAT},
+            {"float64", T::DOUBLE},
+            {"double", T::DOUBLE},
+
+            {"bool", T::BOOL},
+        };
+        const auto found = type_mapping.find(dtype);
+        if (found == type_mapping.end()) {
+            std::cerr << "GetmuDNNType error : " << dtype << std::endl;
+            return type_mapping.at("float");
+        }
+        return found->second;
+    }
+    // Validate the user-supplied MatMul parameters, replacing unsupported
+    // values with fallbacks (each fallback is reported on stderr).
+    // Always returns true: invalid settings are corrected, never rejected.
+    bool CheckParams() {
+        bool pass = true;
+        if (mode != 0 && mode != 1) {
+            std::cerr << "MatMul mode setting error, fallback 0" << std::endl;
+            mode = 0;
+        }
+        if (m <= 0 || n <= 0 || k <= 0) {
+            std::cerr << "MatMul param setting error, fallback 1" << std::endl;
+            m = m > 0 ? m : 1;
+            n = n > 0 ? n : 1;
+            k = k > 0 ? k : 1;
+        }
+        if (gamma != 0) {
+            std::cerr << "MatMul unsupported gamma != 0 temporarily, fallback 0"
+                << std::endl;
+            gamma = 0;
+        }
+        // The qint8 constraints run before the beta check because they may
+        // force mode back to 0, which in turn makes beta != 0 unsupported.
+        if (dtype == DType::q8) {
+            // To be removed when binary supports QINT8
+            if (stable_check_gpu) {
+                std::cerr
+                    << "MatMul unsupported qint8 for stable_check_gpu, fallback cpu "
+                    << std::endl;
+                stable_check_gpu = false;
+            }
+            if (mode != 0) {
+                std::cerr << "MatMul mode must be 0 when qint8, fallback 0"
+                    << std::endl;
+                mode = 0;
+            }
+        }
+        if (beta != 0 && mode == 0) {
+            std::cerr << "MatMul unsupported beta != 0 when mode == 0, fallback 0"
+                << std::endl;
+            beta = 0;
+        }
+        return pass;
+    }
+
+    // Allocate and fill the host/device buffers for A, B and C (plus the bias
+    // vector when gamma != 0), describe them to muDNN through tensor_a/b/c/z and
+    // configure the MatMul op. Returns true once setup has finished; failures
+    // are left to the CHECK_MUSA / CHECK_ERR macros (defined outside this block).
+    bool Init() {
+        size_t nr_elem_a = (size_t)(m)*k;
+        size_t nr_elem_b = (size_t)(k)*n;
+        size_t nr_elem_c = (size_t)(m)*n;
+        size_t nr_elem_z = (size_t)(n);
+
+        size_t size_a = nr_elem_a * dtype_size;
+        size_t size_b = nr_elem_b * dtype_size;
+        size_t size_c = nr_elem_c * dtype_size;
+        size_t size_z = nr_elem_z * dtype_size;
+
+        size_t mem_total, mem_free;
+        CHECK_MUSA(musaMemGetInfo(&mem_free, &mem_total));
+        size_t available_gpu_mem = mem_free;
+        size_t total_gpu_mem = mem_total;
+        size_t need_gpu_mem = size_a + size_b + size_c;
+        if (gamma != 0) {
+            need_gpu_mem += size_z;
+        }
+        if (stable_check && stable_check_gpu) {
+            need_gpu_mem +=
+                size_c + sizeof(bool) * nr_elem_c + sizeof(int64_t) * m * n * 2;
+        }
+        if ((need_gpu_mem > available_gpu_mem) || verbose) {
+            SHOW("%s : Need Device Memory %.2f GiB, Available Device Memory %.2f GiB "
+                "(Total %.2f GiB)\n",
+                (need_gpu_mem > available_gpu_mem) ? "Error" : "Verbose",
+                need_gpu_mem / 1024.f / 1024 / 1024,
+                available_gpu_mem / 1024.f / 1024 / 1024,
+                total_gpu_mem / 1024.f / 1024 / 1024);
+        }
+        CHECK_ERR(need_gpu_mem > available_gpu_mem);
+
+        // host buffers (h_buf_o has the same size as C)
+        h_buf_a = operator new(size_a); // new char[size_a]();
+        h_buf_b = operator new(size_b); // new char[size_b]();
+        h_buf_c = operator new(size_c); // new char[size_c]();
+        h_buf_o = operator new(size_c); // new char[size_c]();
+
+        // host data initialization, dispatched on the element type
+        if (dtype == DType::f16) {
+            GenerateRandom<half, float>((half*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<half, float>((half*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<half, float>((half*)(h_buf_c), nr_elem_c, seed);
+        }
+        else if (dtype == DType::bf16) {
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_c), nr_elem_c, seed);
+        }
+        else if (dtype == DType::q8) {
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_c), nr_elem_c, seed);
+        }
+        else {
+            GenerateRandom<float, float>((float*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<float, float>((float*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<float, float>((float*)(h_buf_c), nr_elem_c, seed);
+        }
+
+        // f32 with mode 0: pass every host value through F32MaskFormatTF32
+        if ((dtype == DType::f32) && mode == 0) {
+            for (size_t i = 0; i < nr_elem_a; i++) {
+                ((float*)h_buf_a)[i] = (float)F32MaskFormatTF32(((float*)h_buf_a)[i]);
+            }
+            for (size_t i = 0; i < nr_elem_b; i++) {
+                ((float*)h_buf_b)[i] = (float)F32MaskFormatTF32(((float*)h_buf_b)[i]);
+            }
+            for (size_t i = 0; i < nr_elem_c; i++) {
+                ((float*)h_buf_c)[i] = (float)F32MaskFormatTF32(((float*)h_buf_c)[i]);
+            }
+        }
+
+        // device buffer
+        CHECK_MUSA(musaMalloc(&d_a, size_a));
+        CHECK_MUSA(musaMalloc(&d_b, size_b));
+        CHECK_MUSA(musaMalloc(&d_c, size_c));
+
+        // transfer host data to device
+        CHECK_MUSA(musaMemcpy(d_a, h_buf_a, size_a, musaMemcpyHostToDevice));
+        CHECK_MUSA(musaMemcpy(d_b, h_buf_b, size_b, musaMemcpyHostToDevice));
+        CHECK_MUSA(musaMemcpy(d_c, h_buf_c, size_c, musaMemcpyHostToDevice));
+
+        // bias vector (gamma != 0 only): fill the host copy BEFORE uploading it
+        if (gamma != 0) {
+            h_buf_z = new char[size_z]();
+            if (dtype == DType::f16) {
+                GenerateRandom<half, float>((half*)(h_buf_z), nr_elem_z, seed);
+            }
+            else if (dtype == DType::bf16) {
+                GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_z), nr_elem_z, seed);
+            }
+            else if (dtype == DType::q8) {
+                GenerateRandom<qint8, qint8>((qint8*)(h_buf_z), nr_elem_z, seed);
+            }
+            else {
+                GenerateRandom<float, float>((float*)(h_buf_z), nr_elem_z, seed);
+            }
+            CHECK_MUSA(musaMalloc(&d_z, size_z));
+            CHECK_MUSA(musaMemcpy(d_z, h_buf_z, size_z, musaMemcpyHostToDevice));
+        }
+
+        ::musa::dnn::Tensor::Type ttype = GetmuDNNType(dtype_str);
+        tensor_a.SetAddr(d_a);
+        tensor_a.SetType(ttype);
+        if (DType::q8 == dtype) {
+            tensor_a.SetQuantizationInfo(scale_a);
+        }
+        if (trans_a) {
+            tensor_a.SetNdInfo({ k, m });
+        }
+        else {
+            tensor_a.SetNdInfo({ m, k });
+        }
+
+        tensor_b.SetAddr(d_b);
+        tensor_b.SetType(ttype);
+        if (DType::q8 == dtype) {
+            tensor_b.SetQuantizationInfo(scale_b);
+        }
+        if (trans_b) {
+            tensor_b.SetNdInfo({ n, k });
+        }
+        else {
+            tensor_b.SetNdInfo({ k, n });
+        }
+
+        tensor_c.SetAddr(d_c);
+        tensor_c.SetType(ttype);
+        tensor_c.SetNdInfo({ m, n });
+        if (DType::q8 == dtype) {
+            tensor_c.SetQuantizationInfo(scale_c);
+        }
+
+        tensor_z.SetAddr(d_z);  // d_z is still nullptr when gamma == 0
+        tensor_z.SetType(ttype);
+        tensor_z.SetNdInfo({ n });
+
+        CHECK_MUSA(musaStreamSynchronize(stream));
+        CHECK_MUSA(musaDeviceSynchronize());
+
+        op.SetTranspose(trans_a, trans_b);
+        // op.SetSplitK(split_k);
+        op.SetAlpha(alpha);
+        op.SetBeta(beta);
+        op.SetGamma(gamma);
+        op.SetComputeMode(static_cast<::musa::dnn::MatMul::ComputeMode>(mode));
+
+        return true;
+    }
+
+    void Exec(bool sync = false) {  // issue one MatMul-with-bias run; optionally wait
+        CHECK_ERR(::musa::dnn::Status::SUCCESS !=
+            op.RunWithBiasAdd(*handle, tensor_c, tensor_a, tensor_b, tensor_z, MemoryFunc));
+        CHECK_MUSA(musaGetLastError());  // also check the MUSA runtime's last-error state
+        if (sync) {
+            CHECK_MUSA(musaStreamSynchronize(stream));  // wait on `stream` before returning
+        }
+    }
+};
+
+// Run one f32 MatMul test with a default-constructed MatMulParam and a fixed
+// iteration count. Returns Test()'s bool result converted to int.
+int RunMatMul() {
+    // Placeholder only: musaGetDevice presumably overwrites it with the active device.
+    int device_id = 0;
+    CHECK_MUSA(musaGetDevice(&device_id));
+    MatMulParam param;
+    const int iters = 42000;
+    musaStream_t stream;
+    CHECK_MUSA(musaStreamCreate(&stream));
+    TestMatMul test_mm(stream, device_id, DType::f32, param, iters);
+    bool ret = test_mm.Test();
+    CHECK_MUSA(musaStreamDestroy(stream));
+    return ret;
+}
+
+
+// Entry point: parse <m> <n> <k> <iter>, run the f32 MatMul test on the
+// current device and report the outcome through the process exit status.
+int main(int argc, char* argv[]) {
+    if (argc != 5) {
+        std::cerr << "Usage: " << argv[0] << " <m> <n> <k> <iter>" << std::endl;
+        std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    int m = std::atoi(argv[1]);
+    int n = std::atoi(argv[2]);
+    int k = std::atoi(argv[3]);
+    int iter = std::atoi(argv[4]);
+    std::cout << "========================================" << std::endl;
+    std::cout << "MatMul TF32 Test (MUSA)" << std::endl;
+    std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+    std::cout << "Test Iterations = " << iter << std::endl;
+
+    int device_id = 0;
+    CHECK_MUSA(musaGetDevice(&device_id));
+    MatMulParam param;
+    param.m = m;
+    param.n = n;
+    param.k = k;
+    const int iters = iter;
+    musaStream_t stream;
+    CHECK_MUSA(musaStreamCreate(&stream));
+    TestMatMul test_mm(stream, device_id, DType::f32, param, iters);
+    bool ret = test_mm.Test();
+    CHECK_MUSA(musaStreamDestroy(stream));
+    // Returning the bool directly would exit with 1 when Test() reports true.
+    return ret ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh
new file mode 100644
index 0000000..736c237
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Run mudnn_bench MatMul for every data type in TEST_TYPES over the M/N/K
+# shapes listed below, then summarize the collected log.
+
+# One "M N K" shape per line; columns may be separated by spaces or tabs.
+input_data=$(cat <<'EOF'
+128 128 128
+256 256 256
+512 512 512
+1024 1024 1024
+2048 2048 2048
+4096 4096 4096
+8192 8192 8192
+4098 4098 4098
+8190 8190 8190
+EOF
+)
+test_iter=1000
+
+TEST_TYPES=("f32" "f16" "bf16" "q8" "float8_e4m3" "float8_e5m2")
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+log_file="${LOG_DIR}/bench_f32_f16_bf16_q8_fp8.log"
+> "$log_file"
+
+# Fail early when the benchmark binary is missing (same check as test_gemm_mixed.sh).
+if [ ! -f "../bin/mudnn_bench" ]; then
+    echo "错误：未找到 ../bin/mudnn_bench 可执行文件" | tee -a "$log_file"
+    exit 1
+fi
+
+for type in "${TEST_TYPES[@]}"; do
+    echo "开始测试数据类型：$type"
+    # Default IFS splits on any run of blanks, so the fields need no trimming.
+    while read -r m n k; do
+        [[ -n "$m" && -n "$n" && -n "$k" ]] || continue
+        echo "$m $n $k"
+        MUSA_VISIBLE_DEVICES=7 ../bin/mudnn_bench -m -t "$type" \
+            --mm_m="$m" --mm_n="$n" --mm_k="$k" --mm_mode=0 \
+            --tm i --tmv "$test_iter" -p \
+            >> "$log_file" 2>&1
+        sleep 2
+    done <<< "$input_data"
+done
+
+python exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py "$log_file"
+
+echo "所有测试完成！日志目录：$LOG_DIR"
diff --git a/base_test/matmul_test/test_gemm_fp64_tf32.sh b/base_test/matmul_test/test_gemm_fp64_tf32.sh
new file mode 100644
index 0000000..924c556
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_fp64_tf32.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Benchmark the standalone fp64 / tf32 GEMM executables and summarize the log.
+
+# Matrix shapes to test, one "M N K" triple per line.
+input_data=$(cat <<'EOF'
+128 128 128
+256 256 256
+512 512 512
+1024 1024 1024
+2048 2048 2048
+4096 4096 4096
+8192 8192 8192
+4098 4098 4098
+8190 8190 8190
+8192 768 8192
+EOF
+)
+
+# Iterations per shape.
+test_iter=1000
+
+# Data types to test.
+TEST_TYPES=("fp64" "tf32")
+
+# Directory holding the GEMM executables.
+EXE_DIR="./fp64_tf32_src"
+
+# Log directory and (absolute) log file; the log is truncated on every run.
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+ABS_LOG_DIR=$(realpath "$LOG_DIR")
+log_file="${ABS_LOG_DIR}/bench_fp64_tf32_types.log"
+: > "$log_file"
+
+# Python script that summarizes the log.
+PYTHON_SUMMARIZE="exetrct_log_tools/summarize_fp64_tf32_log.py"
+
+for type in "${TEST_TYPES[@]}"; do
+    echo "=============================="
+    echo "开始测试：$type"
+    echo "=============================="
+
+    # Pick the executable that matches the data type.
+    case "$type" in
+        fp64) exe="${EXE_DIR}/gemm_fp64" ;;
+        tf32) exe="${EXE_DIR}/gemm_tf32" ;;
+        *)
+            echo "未知类型: $type"
+            continue
+            ;;
+    esac
+
+    # Skip this type when its executable is not present.
+    if [[ ! -f "$exe" ]]; then
+        echo "错误：找不到可执行文件 $exe"
+        continue
+    fi
+
+    # Walk through the matrix shapes.
+    while read -r m n k; do
+        # Drop any spaces left in the fields.
+        m=${m// /}
+        n=${n// /}
+        k=${k// /}
+
+        echo "矩阵大小: M=$m, N=$n, K=$k"
+        # Ignore lines that do not provide all three dimensions.
+        [[ -z "$m" || -z "$n" || -z "$k" ]] && continue
+
+        # Run the GEMM test, appending stdout and stderr to the log.
+        MUSA_VISIBLE_DEVICES=7 "$exe" "$m" "$n" "$k" "$test_iter" >> "$log_file" 2>&1
+        sleep 1
+    done <<< "$input_data"
+done
+
+# Summarize the log if the analysis script is available.
+if [[ -f "$PYTHON_SUMMARIZE" ]]; then
+    python "$PYTHON_SUMMARIZE" "$log_file"
+else
+    echo "警告：Python 分析脚本不存在: $PYTHON_SUMMARIZE"
+fi
+
+echo "所有测试完成！日志目录：$ABS_LOG_DIR"
+
diff --git a/base_test/matmul_test/test_gemm_mixed.sh b/base_test/matmul_test/test_gemm_mixed.sh
new file mode 100644
index 0000000..076a95d
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_mixed.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Benchmark mudnn_bench MatMul for the type combinations in TEST_TYPES over a
+# list of M/N/K shapes, then summarize the log.
+
+# Abort on the first failing command that is not part of a tested condition.
+set -e
+
+# Matrix shapes to test, one "M N K" triple per line.
+input_data=$(cat <<'EOF'
+128     128     128
+256     256     256
+512     512     512
+1024    1024    1024
+2048    2048    2048
+4096    4096    4096
+8192    8192    8192
+4098    4098    4098
+8190    8190    8190
+8192    768     8192
+EOF
+)
+test_iter=1000
+
+TEST_TYPES=("f16:f16:f32:f32" "bf16:bf16:f32:f32" "f32" "int8" "q8:q8:f32:f32" "bf16:q4:bf16:bf16" "float8_e4m3:float8_e4m3:f16:f16")
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+log_file="${LOG_DIR}/bench_fix_matmul.log"
+> "$log_file"
+
+# Make sure the benchmark binary exists before starting.
+if [ ! -f "../bin/mudnn_bench" ]; then
+    echo "错误：未找到 ../bin/mudnn_bench 可执行文件" | tee -a "$log_file"
+    exit 1
+fi
+
+echo "开始测试，日志文件：$log_file"
+
+for type in "${TEST_TYPES[@]}"; do
+    echo "开始测试数据类型：$type" | tee -a "$log_file"
+
+    # Read the shape list line by line.
+    echo "$input_data" | while IFS= read -r line; do
+        # Skip empty lines.
+        [ -z "$line" ] && continue
+
+        # Split the line into its three whitespace-separated fields.
+        read m n k <<< "$line"
+
+        echo "测试: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"
+
+        # Reject lines whose fields are not made of decimal digits only.
+        if ! [[ "$m" =~ ^[0-9]+$ ]] || ! [[ "$n" =~ ^[0-9]+$ ]] || ! [[ "$k" =~ ^[0-9]+$ ]]; then
+            echo "错误：参数不是数字: m=$m, n=$n, k=$k" | tee -a "$log_file"
+            continue
+        fi
+
+        # Record the command line in the log for later reference.
+        cmd="MUSA_VISIBLE_DEVICES=7 ../bin/mudnn_bench -m --mm_m=\"$m\" --mm_n=\"$n\" --mm_k=\"$k\" --warmup 30 --tm i --tmv \"$test_iter\" -p -c -t \"$type\""
+        echo "执行命令: $cmd" >> "$log_file"
+
+        # Run the benchmark; testing it in `if` keeps `set -e` from aborting the
+        # whole script when a single shape fails.
+        if MUSA_VISIBLE_DEVICES=7 ../bin/mudnn_bench -m \
+            --mm_m="$m" --mm_n="$n" --mm_k="$k" \
+            --warmup 30 \
+            --tm i \
+            --tmv "$test_iter" \
+            -p \
+            -c \
+            -t "$type" >> "$log_file" 2>&1; then
+            echo "测试成功: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"
+        else
+            exit_code=$?
+            echo "测试失败: M=$m, N=$n, K=$k, Type=$type, 退出码: $exit_code" | tee -a "$log_file"
+        fi
+
+        echo "----------------------------------------" >> "$log_file"
+        sleep 2
+    done
+done
+
+# Summarize the log. The tools live in exetrct_log_tools/ (the previous path
+# "sexetrct_log_tool/" does not exist and made the script abort under set -e).
+python exetrct_log_tools/summary_mixed_data.py "$log_file"
+echo "所有测试完成！日志目录：$LOG_DIR"
+echo "查看日志：cat $log_file"
diff --git a/script/monitor/README.md b/base_test/monitor/README.md
similarity index 100%
rename from script/monitor/README.md
rename to base_test/monitor/README.md
diff --git a/script/monitor/monitor_gpu.sh b/base_test/monitor/monitor_gpu.sh
old mode 100755
new mode 100644
similarity index 100%
rename from script/monitor/monitor_gpu.sh
rename to base_test/monitor/monitor_gpu.sh

From b8f06d97a3a163dcacce5ec4197b6b81326cf97e Mon Sep 17 00:00:00 2001
From: Wang Kang <kang.wang-EXT@mthreads.com>
Date: Fri, 5 Dec 2025 16:08:35 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=E8=A1=A5=E5=85=85=E6=B7=B7=E5=90=88?=
 =?UTF-8?q?=E7=B2=BE=E5=BA=A6=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 base_test/matmul_test/READE.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/base_test/matmul_test/READE.md b/base_test/matmul_test/READE.md
index c54a072..68b7532 100644
--- a/base_test/matmul_test/READE.md
+++ b/base_test/matmul_test/READE.md
@@ -1,4 +1,4 @@
-Matmul 自动化测试脚本
+Matmul 自动化测试
 # 1. 脚本说明
 matmul 存放位置：
 ```shell
@@ -9,7 +9,8 @@ mudnn_bench
 │   └── mudnn_bench-x.x.x
 ├── matmul_test
 ```
-mudnn_bench 示例：
+mudnn_bench 示例：  
+**部分旧版本的 mudnn_bench 和 muDNN 不支持混合精度测试，使用前请与开发者确认。**
 ```shell
 
 # 示例 1：单卡，大矩阵，f32
@@ -26,7 +27,9 @@ MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048
 ```
 
 # 2. 测试
+可在测试脚本中自行批量配置待测的 M/N/K、warmup 次数、迭代次数（iter）等参数。
 ## 2.1 fp64, tf32 测试
+注意：fp64和tf32 数据类型调用非 mudnn 接口
 ```shell
 # 1. 编译
 bash ./fp64_tf32_src/build_gemm_tf32.sh
@@ -38,6 +41,12 @@ bash test_gemm_fp64_tf32.sh
 ```
 
 ## 2.2 f32_f16_bf16_q8_fp8 测试
+mudnn_bench 测试矩阵元素的默认取值范围：
+- 浮点：-0.5~0.5
+- fp8：-10~10 的整数转为浮点
+- qint4：-7~7
+- 整型：-127~127
+> 部分版本的 mudnn_bench 支持全 0 输入测试（通过参数 `-z` 开启），使用前请与开发者确认。
 ```shell
 bash test_gemm_f32_f16_bf16_q8_fp8.sh
 ```

From d9b077c000844a900519247e2bed5bda2646b40c Mon Sep 17 00:00:00 2001
From: Wang Kang <kang.wang-EXT@mthreads.com>
Date: Fri, 5 Dec 2025 16:51:11 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E8=AF=B4=E6=98=8E?=
 =?UTF-8?q?=E4=BB=A5=E5=8F=8A=E7=BC=96=E8=AF=91=E8=B7=AF=E5=BE=84=E9=97=AE?=
 =?UTF-8?q?=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 base_test/matmul_test/READE.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/base_test/matmul_test/READE.md b/base_test/matmul_test/READE.md
index 68b7532..59ed02f 100644
--- a/base_test/matmul_test/READE.md
+++ b/base_test/matmul_test/READE.md
@@ -1,7 +1,8 @@
 Matmul 自动化测试
 # 1. 脚本说明
-matmul 存放位置：
+matmul 放置位置：
 ```shell
+# mudnn_bench 默认存放在 /usr/local/musa/ 下
 mudnn_bench
 ├── bench_test_matmul.sh
 ├── bin
@@ -32,9 +33,10 @@ MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048
 注意：fp64和tf32 数据类型调用非 mudnn 接口
 ```shell
 # 1. 编译
-bash ./fp64_tf32_src/build_gemm_tf32.sh
+cd ./fp64_tf32_src
 
-bash ./fp64_tf32_src/build_gemm_fp64.sh
+bash build_gemm_tf32.sh
+bash build_gemm_fp64.sh && cd ..   # 编译完成后回到 matmul_test 目录，测试脚本使用相对路径
 
 ## 2. 测试
 bash test_gemm_fp64_tf32.sh

From 1de46505642ba29abc0729d491b1b19f20680c91 Mon Sep 17 00:00:00 2001
From: Wang Kang <kang.wang-EXT@mthreads.com>
Date: Tue, 13 Jan 2026 13:53:49 +0800
Subject: [PATCH 4/4] add gpu monitor

---
 base_test/monitor/README.md               | 304 ++++++++++++++++++++--
 base_test/monitor/README_sh.md            |  42 +++
 base_test/monitor/mthreads_gpu_monitor.py | 296 +++++++++++++++++++++
 3 files changed, 616 insertions(+), 26 deletions(-)
 create mode 100644 base_test/monitor/README_sh.md
 create mode 100644 base_test/monitor/mthreads_gpu_monitor.py

diff --git a/base_test/monitor/README.md b/base_test/monitor/README.md
index 3d353f3..3c8c93a 100644
--- a/base_test/monitor/README.md
+++ b/base_test/monitor/README.md
@@ -1,42 +1,294 @@
-# GPU 监控脚本使用说明
+# GPU Monitor for MTT (Mthreads)
 
-该脚本用于定时采集指定 GPU 的温度和图形频率，支持高温/降频报警，并可将结果记录到日志文件中，适用于性能测试与运行状态监控。
+一个轻量级的 GPU 监控工具，用于实时采集和监控 Mthreads GPU 的性能指标。
 
----
+## 功能特性
 
-## ✅ 脚本功能
+- 🚀 **实时监控**：每5秒自动刷新 GPU 信息
+- 🔔 **阈值告警**：支持温度和显存占比的告警机制
+- 📊 **CSV 日志**：自动记录 GPU 数据到 CSV 文件
+- 🔄 **灵活使用**：支持单次读取和循环监控两种模式
+- 🧵 **多线程**：后台线程处理，不阻塞主程序
+- 📈 **数据导出**：支持 Dict 和对象两种格式获取数据
 
-- 支持指定 GPU 设备编号
-- 可自定义刷新时间间隔和记录次数
-- 实时记录温度（℃）与图形频率（MHz）
-- 高温（>95°C）或降频（<1750MHz）触发报警（门限可根据实际设备调整）
-- 监控结束后自动统计报警次数
-- 默认输出日志文件为：`gpu_monitor_log.txt`
+## 环境要求
 
----
+- Python 3.6+
+- `mthreads-gmi` 命令可用
 
-## ⚙️ 参数说明
+## 安装
 
-| 参数 | 含义               | 示例            |
-|------|--------------------|-----------------|
-| `-d` | GPU 设备编号        | `-d 0`          |
-| `-i` | 刷新时间（单位：秒）| `-i 1`          |
-| `-n` | 记录次数（默认无限）| `-n 10`         |
+```bash
+# 直接使用（无需额外依赖）
+python3 mthreads_gpu_monitor.py
+```
 
-> 如未指定 `-n`，脚本将持续运行，直到手动停止。
+## 使用方法
 
----
+### 方式1：单次读取 GPU 信息
 
-## 🚀 使用示例
+```python
+from mthreads_gpu_monitor import GPUMonitor
 
-### ✅ 前台运行
+# 创建监控对象
+monitor = GPUMonitor()
 
-```bash
-./monitor_gpu.sh -d 0 -i 1 -n 10
+# 读取一次 GPU 信息
+monitor.update()
+
+# 获取所有 GPU 信息（List[Dict] 格式）
+all_gpus = monitor.to_dict()
+print(all_gpus)
+
+# 获取单张 GPU 信息
+gpu0 = monitor.get_gpu(0)
+print(f"GPU 0 温度: {gpu0.temperature}°C")
+
+# 获取多张 GPU 信息
+gpus = monitor.get_gpu([0, 1, 2])
+for gpu in gpus:
+    print(gpu)
 ```
 
-### ✅ 后台运行
+### 方式2：循环监控（后台自动刷新）
+
+```python
+from mthreads_gpu_monitor import GPUMonitor
+
+# 创建监控对象（配置告警阈值和CSV日志）
+monitor = GPUMonitor(
+    refresh_interval=5,           # 刷新间隔（秒）
+    csv_path="gpu_metrics.csv",   # CSV 日志文件路径
+    alert_config={
+        "temperature": 80,              # 温度告警阈值（°C）
+        "memory_used_ratio": 0.9,       # 显存占比告警阈值（90%）
+    },
+)
+
+# 启动后台监控线程
+monitor.start()
+
+# 主程序继续执行（监控在后台运行）
+import time
+time.sleep(60)
+
+# 停止监控
+monitor.stop()
 ```
-nohup ./monitor_gpu.sh -d 0 -i 1 -n 100 > /dev/null 2>&1 &
-tail -f gpu_monitor_log.txt
+
+### 方式3：自定义告警回调
+
+```python
+from mthreads_gpu_monitor import GPUMonitor, GPUInfo
+
+def custom_alert(gpu: GPUInfo, msg: str):
+    """自定义告警处理函数"""
+    print(f"【自定义告警】{msg}")
+    # 可以在这里发送邮件、钉钉等
+
+monitor = GPUMonitor(
+    refresh_interval=5,
+    csv_path="gpu_metrics.csv",
+    alert_config={
+        "temperature": 80,
+        "memory_used_ratio": 0.9,
+    },
+    alert_callback=custom_alert,  # 传入自定义回调函数
+)
+
+monitor.start()
 ```
+
+## 类和方法说明
+
+### `GPUInfo` 类
+
+GPU 信息的数据类，包含以下属性：
+
+| 属性 | 类型 | 说明 |
+|------|------|------|
+| `index` | int | GPU 索引号 |
+| `model` | str | GPU 型号 |
+| `temperature` | float | 温度（°C） |
+| `power` | float | 功耗（W） |
+| `utilization` | float | GPU 利用率（%） |
+| `memory_total` | float | 显存总量（MiB） |
+| `memory_used` | float | 显存已用（MiB） |
+| `memory_used_ratio` | float | 显存占比（0.0-1.0） |
+
+#### 方法
+
+- `to_dict()` - 返回 Dict 格式的数据
+- `__repr__()` - 返回对象的字符串表示
+
+### `GPUMonitor` 类
+
+GPU 监控器主类。
+
+#### 初始化参数
+
+```python
+GPUMonitor(
+    refresh_interval: int = 5,                              # 刷新间隔（秒）
+    csv_path: Optional[str] = None,                        # CSV 日志路径
+    alert_config: Optional[Dict[str, float]] = None,       # 告警配置
+    alert_callback: Optional[Callable[[GPUInfo, str], None]] = None  # 告警回调
+)
+```
+
+#### 主要方法
+
+| 方法 | 说明 |
+|------|------|
+| `update()` | 立即更新一次 GPU 信息（含告警和CSV记录） |
+| `start()` | 启动后台监控线程（定时调用 update）|
+| `stop()` | 停止后台监控线程 |
+| `to_dict()` | 返回所有 GPU 信息的 Dict 列表 |
+| `get_gpu(index)` | 按索引获取单张或多张 GPU 信息 |
+
+## 示例输出
+
+### 方式1：单次读取
+```
+所有 GPU 信息:
+[
+    {
+        'index': 0,
+        'model': 'MTT S4000',
+        'temperature': 75.0,
+        'power': 274.7,
+        'utilization': 0.0,
+        'memory_total': 49152.0,
+        'memory_used': 516.0,
+        'memory_used_ratio': 0.0105
+    },
+    ...
+]
+
+第0号 GPU 的 memory_total 属性:
+49152.0
+```
+
+### 方式2：循环监控
+```
+GPU 监控程序已启动...
+每 5 秒刷新一次，温度 ≥80°C 或显存占比 ≥90% 时告警
+CSV日志保存到: gpu_metrics.csv
+按 Ctrl+C 停止监控
+
+[2026-01-13 12:29:55] GPU Monitor Status:
+--------------------------------------------------------------------------------
+GPU 0 (MTT S4000):
+  温度:   75.0°C  | 功耗:  274.7W
+  显存:     516/  49152 MiB (  1.0%)
+  利用率:   0.0%
+GPU 1 (MTT S4000):
+  温度:   63.0°C  | 功耗:  253.9W
+  显存:     516/  49152 MiB (  1.0%)
+  利用率:   0.0%
+...
+```
+
+## CSV 日志格式
+
+自动生成的 CSV 文件包含以下列：
+
+```csv
+timestamp,gpu_index,model,temperature,utilization,memory_used,memory_total,power
+2026-01-13T12:29:55.123456,0,MTT S4000,75.0,0.0,516,49152,274.7
+2026-01-13T12:29:55.123456,1,MTT S4000,63.0,0.0,516,49152,253.9
+```
+
+## 告警机制
+
+### 默认告警
+
+当以下条件满足时，会触发告警：
+
+1. **温度告警**：`temperature >= alert_config["temperature"]`
+2. **显存告警**：`memory_used_ratio >= alert_config["memory_used_ratio"]`
+
+### 告警输出
+
+```
+[ALERT] GPU 0 temperature exceeded | temp=85.5C mem_ratio=0.55
+[ALERT] GPU 2 memory exceeded | temp=70.0C mem_ratio=0.92
+```
+
+### 自定义告警
+
+通过 `alert_callback` 参数传入自定义函数处理告警：
+
+```python
+def send_alert_email(gpu: GPUInfo, msg: str):
+    # 发送邮件
+    pass
+
+monitor = GPUMonitor(alert_callback=send_alert_email)
+```
+
+## 常见问题
+
+### Q: 如何在实际程序中集成此监控工具？
+
+A: 启动监控线程后，主程序可以继续执行其他任务，监控在后台运行：
+
+```python
+monitor = GPUMonitor(...)
+monitor.start()
+
+# 主程序代码
+for i in range(100):
+    # 处理任务...
+    pass
+
+monitor.stop()
+```
+
+### Q: 如何获取最新的 GPU 数据？
+
+A: 在循环监控模式下，访问 `monitor.gpus` 即可获取最新数据：
+
+```python
+monitor.start()
+time.sleep(10)
+for gpu in monitor.gpus:
+    print(gpu.temperature)
+```
+
+### Q: 支持多进程吗？
+
+A: 支持。每个 GPUMonitor 实例独立运行，可创建多个实例进行监控。
+
+### Q: 告警阈值可以动态修改吗？
+
+A: 可以，修改 `monitor.alert_config` 字典即可：
+
+```python
+monitor.alert_config["temperature"] = 90  # 修改温度告警阈值
+```
+
+## 故障排除
+
+### 错误：`mthreads-gmi: command not found`
+
+确保 `mthreads-gmi` 命令已正确安装并在 PATH 中。
+
+### 数据为空
+
+检查是否有 Mthreads GPU 硬件连接，运行：
+```bash
+mthreads-gmi -q --json
+```
+
+### CSV 文件权限问题
+
+确保对 CSV 文件路径的目录有写权限。
+
+## 许可证
+
+MIT
+
+## 作者
+
+wangkang
diff --git a/base_test/monitor/README_sh.md b/base_test/monitor/README_sh.md
new file mode 100644
index 0000000..3d353f3
--- /dev/null
+++ b/base_test/monitor/README_sh.md
@@ -0,0 +1,42 @@
+# GPU 监控脚本使用说明
+
+该脚本用于定时采集指定 GPU 的温度和图形频率，支持高温/降频报警，并可将结果记录到日志文件中，适用于性能测试与运行状态监控。
+
+---
+
+## ✅ 脚本功能
+
+- 支持指定 GPU 设备编号
+- 可自定义刷新时间间隔和记录次数
+- 实时记录温度（℃）与图形频率（MHz）
+- 高温（>95°C）或降频（<1750MHz）触发报警（门限可根据实际设备调整）
+- 监控结束后自动统计报警次数
+- 默认输出日志文件为：`gpu_monitor_log.txt`
+
+---
+
+## ⚙️ 参数说明
+
+| 参数 | 含义               | 示例            |
+|------|--------------------|-----------------|
+| `-d` | GPU 设备编号        | `-d 0`          |
+| `-i` | 刷新时间（单位：秒）| `-i 1`          |
+| `-n` | 记录次数（默认无限）| `-n 10`         |
+
+> 如未指定 `-n`，脚本将持续运行，直到手动停止。
+
+---
+
+## 🚀 使用示例
+
+### ✅ 前台运行
+
+```bash
+./monitor_gpu.sh -d 0 -i 1 -n 10
+```
+
+### ✅ 后台运行
+```bash
+nohup ./monitor_gpu.sh -d 0 -i 1 -n 100 > /dev/null 2>&1 &
+tail -f gpu_monitor_log.txt
+```
diff --git a/base_test/monitor/mthreads_gpu_monitor.py b/base_test/monitor/mthreads_gpu_monitor.py
new file mode 100644
index 0000000..b16ca03
--- /dev/null
+++ b/base_test/monitor/mthreads_gpu_monitor.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# author: wangkang
+
+"""
+GPU Monitor for MTT (mthreads-gmi)
+
+Features:
+- Periodic GPU info refresh
+- CSV logging
+- Threshold alerts (temperature / memory)
+"""
+
+import json
+import subprocess
+import threading
+import time
+import csv
+from datetime import datetime
+from typing import List, Dict, Any, Optional, Callable, Union
+
+
+class GPUInfo:
+    """Metrics of one GPU: identity, memory, utilization, temperature, power."""
+
+    def __init__(
+        self,
+        index: int,
+        model: str,
+        memory_total: float,
+        memory_used: float,
+        utilization: float,
+        temperature: float,
+        power: float,
+    ):
+        """Store the given values unchanged as same-named attributes."""
+        self.index = index
+        self.model = model
+        self.memory_total = memory_total
+        self.memory_used = memory_used
+        self.utilization = utilization
+        self.temperature = temperature
+        self.power = power
+
+    @property
+    def memory_used_ratio(self) -> float:
+        """Return memory_used / memory_total, or 0.0 if memory_total <= 0."""
+        total = self.memory_total
+        return 0.0 if total <= 0 else self.memory_used / total
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all attributes plus memory_used_ratio as a plain dict."""
+        # memory_used_ratio resolves through the property; the rest are plain attributes.
+        names = (
+            "index", "model", "memory_total", "memory_used",
+            "memory_used_ratio", "utilization", "temperature", "power",
+        )
+        return {name: getattr(self, name) for name in names}
+
+    def __repr__(self) -> str:
+        """Return a one-line summary with units appended (%, C, MiB, W)."""
+        identity = f"GPUInfo(index={self.index}, model='{self.model}', "
+        load = f"util={self.utilization}%, temp={self.temperature}C, "
+        memory = (
+            f"memory_used={self.memory_used}MiB, "
+            f"memory_total={self.memory_total}MiB, "
+        )
+        return identity + load + memory + f"power={self.power}W)"
+
+
+class GPUMonitor:
+    def __init__(
+        self,
+        refresh_interval: int = 5,
+        csv_path: Optional[str] = None,
+        alert_config: Optional[Dict[str, float]] = None,
+        alert_callback: Optional[Callable[[GPUInfo, str], None]] = None,
+    ):
+        """
+        Configure a monitor; no query is run until update() or start() is called.
+
+        refresh_interval: seconds to pause between refreshes in the background loop
+        csv_path: CSV output path (None means no CSV is written)
+        alert_config: thresholds compared with >= after each successful update,
+            e.g. {"temperature": 80, "memory_used_ratio": 0.9}
+        alert_callback: called as callback(gpu, message) on alert; if None, print
+        """
+        self.command = ["mthreads-gmi", "-q", "--json"]
+        self.refresh_interval = refresh_interval
+        self.csv_path = csv_path
+        self.alert_config = alert_config or {}  # None (or an empty dict) becomes a fresh {}
+        self.alert_callback = alert_callback
+
+        self.gpus: List[GPUInfo] = []  # replaced by update() when a query returns data
+
+        self._stop_event = threading.Event()
+        self._thread: Optional[threading.Thread] = None
+
+    def _extract_float(self, value: Any, unit: str = "") -> float:
+        """Parse a float from value, dropping a trailing unit ("45C", "45 C", "45C ")."""
+        numeric = isinstance(value, (int, float))  # already a number: nothing to trim
+        return float(value if numeric else str(value).strip().rstrip(unit))  # strip first so "45C " parses
+
+    def _run_command(self) -> Optional[List[GPUInfo]]:
+        """Run self.command and parse its JSON output; print and return None on any failure."""
+        try:
+            result = subprocess.run(
+                self.command,
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+
+            if result.returncode != 0:
+                print("mthreads-gmi failed:", result.stderr)
+                return None
+
+            data = json.loads(result.stdout)
+            gpus: List[GPUInfo] = []
+            for gpu in data.get("GPU", []):  # a missing "GPU" key yields an empty list
+                gpus.append(
+                    GPUInfo(
+                        index=int(gpu.get("Index", -1)),
+                        model=gpu.get("Product Name", "Unknown"),
+                        memory_total=self._extract_float(
+                            gpu.get("FB Memory Usage", {}).get("Total", 0), "MiB"
+                        ),
+                        memory_used=self._extract_float(
+                            gpu.get("FB Memory Usage", {}).get("Used", 0), "MiB"
+                        ),
+                        utilization=self._extract_float(
+                            gpu.get("Utilization", {}).get("Gpu", 0), "%"
+                        ),
+                        temperature=self._extract_float(
+                            gpu.get("Temperature", {}).get("GPU Current Temp", "0C"), "C"
+                        ),
+                        power=self._extract_float(  # NOTE(review): key below ends with a space - confirm
+                            gpu.get("Power Readings", {}).get("Power Draw ", "0W"), "W"
+                        ),
+                    )
+                )
+            return gpus
+
+        except Exception as e:  # one unparsable field discards the whole query result
+            print("GPU query error:", e)
+            return None
+
+    def update(self) -> None:
+        """Refresh GPU info once, then run alert checks and CSV logging if data was returned."""
+        gpus = self._run_command()
+        if gpus:  # None (query failed) or an empty list leaves self.gpus untouched
+            self.gpus = gpus
+            self._check_alerts()
+            if self.csv_path:
+                self._save_csv()
+
+    def to_dict(self) -> List[Dict[str, Any]]:
+        """
+        Return the info of every GPU in self.gpus as a list of dicts.
+        """
+        return [gpu.to_dict() for gpu in self.gpus]
+    
+    def get_gpu(self, index: Union[int, List[int]]) -> Union[GPUInfo, List[GPUInfo], None]:
+        """
+        Look up GPUs by position in self.gpus: int -> GPUInfo or None, list -> list of in-range hits.
+        """
+        if isinstance(index, int):
+            return self.gpus[index] if 0 <= index < len(self.gpus) else None
+        elif isinstance(index, list):
+            return [self.gpus[i] for i in index if 0 <= i < len(self.gpus)]
+        return None  # any other argument type
+
+    def start(self) -> None:  # launch the background refresh thread
+        if self._thread and self._thread.is_alive():
+            return  # already running: do not start a second thread
+        self._stop_event.clear()
+        self._thread = threading.Thread(target=self._loop, daemon=True)
+        self._thread.start()
+
+    def stop(self) -> None:  # ask the background loop to exit; the thread is not joined here
+        self._stop_event.set()
+
+    def _loop(self):  # thread body: refresh, pause for refresh_interval, repeat until stopped
+        while not self._stop_event.is_set():
+            self.update()
+            self._stop_event.wait(self.refresh_interval)  # unlike sleep(), returns at once on stop()
+
+    def _save_csv(self):
+        """Append one row per GPU in self.gpus to self.csv_path.
+
+        The header row is written only when the file is empty, which covers both
+        a newly created file and an existing zero-byte file. No separate read
+        probe is made, so the path only has to be writable.
+        """
+        with open(self.csv_path, "a", newline="") as f:
+            writer = csv.writer(f)
+
+            # Append mode positions the stream at end-of-file, so 0 means empty.
+            if f.tell() == 0:
+                writer.writerow([
+                    "timestamp",
+                    "gpu_index",
+                    "model",
+                    "temperature",
+                    "utilization",
+                    "memory_used",
+                    "memory_total",
+                    "power",
+                ])
+
+            ts = datetime.now().isoformat()
+            for gpu in self.gpus:
+                writer.writerow([
+                    ts,
+                    gpu.index,
+                    gpu.model,
+                    gpu.temperature,
+                    gpu.utilization,
+                    gpu.memory_used,
+                    gpu.memory_total,
+                    gpu.power,
+                ])
+
+
+    def _check_alerts(self):
+        """Call _alert for every GPU metric that is >= its threshold in alert_config."""
+        rules = (("temperature", "temperature"), ("memory_used_ratio", "memory"))
+        for gpu in self.gpus:
+            for key, label in rules:
+                if key not in self.alert_config:
+                    continue  # no threshold configured for this metric
+                if getattr(gpu, key) >= self.alert_config[key]:
+                    self._alert(gpu, label)
+
+    def _alert(self, gpu: GPUInfo, alert_type: str):
+        """Format the alert text and pass it to alert_callback, or print it if unset."""
+        text = (
+            f"[ALERT] GPU {gpu.index} {alert_type} exceeded | "
+            f"temp={gpu.temperature}C "
+            f"mem_ratio={gpu.memory_used_ratio:.2f}"
+        )
+        # With no callback configured, fall back to printing on stdout.
+        notify = self.alert_callback or (lambda _gpu, message: print(message))
+        notify(gpu, text)
+
+
+
+if __name__ == "__main__":
+    # Mode 1: read the GPU info a single time.
+    monitor = GPUMonitor()
+    monitor.update()  # one direct update() call, no background thread
+
+    # Print all GPUs at once (List[dict] format).
+    print("=== 方式1: 只读取一次 GPU 信息 ===")
+    print("所有 GPU 信息:")
+    print(monitor.to_dict(), "\n")
+
+    # The GPU 0 examples need at least one GPU; skip them when the query
+    # returned nothing instead of failing with IndexError/AttributeError.
+    if monitor.gpus:
+        print("第0号 GPU 信息:")
+        print(monitor.gpus[0].to_dict(), "\n")
+        # get_gpu() returns the GPUInfo object; read one of its attributes.
+        print("第0号 GPU 的 memory_total 属性:")
+        print(monitor.get_gpu(0).memory_total, "\n")
+
+    # Several GPUs at once; positions that do not exist are left out.
+    print(monitor.get_gpu([0, 1]), "\n")
+
+    # Mode 2: background monitoring (refresh every 5 seconds).
+    print("\n=== 方式2: 循环监控 ===")
+    monitor = GPUMonitor(
+        refresh_interval=5,
+        csv_path="gpu_metrics.csv",
+        alert_config={
+            "temperature": 80,
+            "memory_used_ratio": 0.9,
+        },
+    )
+    print("GPU 监控程序已启动...")
+    print("每 5 秒刷新一次，温度 ≥80°C 或显存占比 ≥90% 时告警")
+    print("CSV日志保存到: gpu_metrics.csv")
+    print("按 Ctrl+C 停止监控\n")
+
+    monitor.start()
+    try:
+        time.sleep(30)  # stand-in for the real workload being monitored
+    except KeyboardInterrupt:
+        pass  # Ctrl+C is the advertised way to end the demo
+    finally:
+        monitor.stop()
+
+
+
+
+