diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh
index f3524797..4ea24c01 100644
--- a/.ci/run_test_suite.sh
+++ b/.ci/run_test_suite.sh
@@ -74,6 +74,9 @@ case $TestSuite in
 "evmonetestsuite")
     CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON"
     ;;
+"benchmarksuite")
+    CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON"
+    ;;
 esac
 
 case $CPU_EXCEPTION_TYPE in
@@ -94,6 +97,10 @@ if [[ $TestSuite == "evmonetestsuite" ]]; then
   STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON")
 fi
 
+if [[ $TestSuite == "benchmarksuite" ]]; then
+  STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON")
+fi
+
 export PATH=$PATH:$PWD/build
 CMAKE_OPTIONS_ORIGIN="$CMAKE_OPTIONS"
@@ -153,5 +160,64 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
     ./run_unittests.sh ../tests/evmone_unittests/EVMOneMultipassUnitTestsRunList.txt "./libdtvmapi.so,mode=multipass"
     ./run_unittests.sh ../tests/evmone_unittests/EVMOneInterpreterUnitTestsRunList.txt "./libdtvmapi.so,mode=interpreter"
     ;;
+  "benchmarksuite")
+    # Clone evmone and run performance regression check
+    EVMONE_DIR="evmone"
+    if [ ! -d "$EVMONE_DIR" ]; then
+      git clone --depth 1 --recurse-submodules -b for_test https://github.com/DTVMStack/evmone.git $EVMONE_DIR
+    fi
+
+    # Set default values for benchmark
+    BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.10}
+    BENCHMARK_MODE=${BENCHMARK_MODE:-multipass}
+
+    # Copy DTVM library to evmone directory
+    cp build/lib/* $EVMONE_DIR/
+
+    cd $EVMONE_DIR
+
+    # Copy check_performance_regression.py from DTVM repo
+    cp ../tools/check_performance_regression.py ./
+
+    # Build evmone if not already built
+    if [ ! -f "build/bin/evmone-bench" ]; then
+      cmake -S . -B build -DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release
+      cmake --build build --parallel -j 16
+    fi
+
+    # Default summary output path (can be overridden via env)
+    BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md}
+
+    # Run performance check based on mode
+    if [ -n "$BENCHMARK_SAVE_BASELINE" ]; then
+      echo "Saving performance baseline..."
+      python3 check_performance_regression.py \
+        --save-baseline "$BENCHMARK_SAVE_BASELINE" \
+        --output-summary "$BENCHMARK_SUMMARY_FILE" \
+        --lib ./libdtvmapi.so \
+        --mode "$BENCHMARK_MODE" \
+        --benchmark-dir test/evm-benchmarks/benchmarks
+    elif [ -n "$BENCHMARK_BASELINE_FILE" ]; then
+      echo "Checking performance regression against baseline..."
+      python3 check_performance_regression.py \
+        --baseline "$BENCHMARK_BASELINE_FILE" \
+        --threshold "$BENCHMARK_THRESHOLD" \
+        --output-summary "$BENCHMARK_SUMMARY_FILE" \
+        --lib ./libdtvmapi.so \
+        --mode "$BENCHMARK_MODE" \
+        --benchmark-dir test/evm-benchmarks/benchmarks
+    else
+      echo "Running benchmark suite without comparison..."
+      python3 check_performance_regression.py \
+        --save-baseline benchmark_results.json \
+        --output-summary "$BENCHMARK_SUMMARY_FILE" \
+        --lib ./libdtvmapi.so \
+        --mode "$BENCHMARK_MODE" \
+        --benchmark-dir test/evm-benchmarks/benchmarks
+      cat benchmark_results.json
+    fi
+
+    cd ..
+    ;;
   esac
 done
diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml
index 8763f4a7..03ec1d6a 100644
--- a/.github/workflows/dtvm_evm_test_x86.yml
+++ b/.github/workflows/dtvm_evm_test_x86.yml
@@ -169,3 +169,116 @@ jobs:
           export TestSuite=evmonetestsuite
           bash .ci/run_test_suite.sh
+
+  performance_regression_check:
+    name: Performance Regression Check (10% threshold)
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    container:
+      image: dtvmdev1/dtvm-dev-x64:main
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+        with:
+          submodules: "true"
+          fetch-depth: 0
+
+      - name: Setup git safe directory
+        run: |
+          echo "Configuring git safe directory: ${{ github.workspace }}"
+          git config --global --add safe.directory /__w/DTVM/DTVM
+
+      - name: Code Format Check
+        run: |
+          ./tools/format.sh check
+
+      - name: Build baseline (${{ github.base_ref }})
+        run: |
+          echo "Building baseline on branch: ${{ github.base_ref }}"
+
+          export LLVM_SYS_150_PREFIX=/opt/llvm15
+          export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm
+          export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH
+
+          # Save current state (including untracked files)
+          git stash push -u -m "perf-check-stash"
+          git checkout ${{ github.base_ref }}
+
+          # Build baseline
+          export CMAKE_BUILD_TARGET=Release
+          export ENABLE_ASAN=false
+          export RUN_MODE=multipass
+          export ENABLE_LAZY=false
+          export ENABLE_MULTITHREAD=true
+          export TestSuite=benchmarksuite
+          export CPU_EXCEPTION_TYPE='cpu'
+          export BENCHMARK_MODE=interpreter
+          export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline.json
+
+          bash .ci/run_test_suite.sh
+
+      - name: Build current PR and check regression
+        id: perf-check
+        run: |
+          echo "Building PR branch: ${{ github.sha }}"
+
+          export LLVM_SYS_150_PREFIX=/opt/llvm15
+          export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm
+          export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH
+
+          # Switch back to PR branch
+          git checkout ${{ github.sha }}
+          git stash pop || true
+
+          # Clean and rebuild for current PR
+          rm -rf build evmone
+
+          # Build and check
+          export CMAKE_BUILD_TARGET=Release
+          export ENABLE_ASAN=false
+          export RUN_MODE=multipass
+          export ENABLE_LAZY=false
+          export ENABLE_MULTITHREAD=true
+          export TestSuite=benchmarksuite
+          export CPU_EXCEPTION_TYPE='cpu'
+          export BENCHMARK_MODE=interpreter
+          export BENCHMARK_THRESHOLD=0.10
+          export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline.json
+          export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary.md
+
+          bash .ci/run_test_suite.sh
+        continue-on-error: true
+
+      - name: Comment on PR
+        if: always()
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const fs = require('fs');
+            const passed = '${{ steps.perf-check.outcome }}' === 'success';
+            let summary = '';
+            try {
+              summary = fs.readFileSync('/tmp/perf_summary.md', 'utf8');
+            } catch (e) {
+              summary = '_No benchmark summary available._';
+            }
+            const icon = passed ? '✅' : '⚠️';
+            const title = passed
+              ? 'Performance Check Passed'
+              : 'Performance Regression Detected';
+            const body = `${icon} **${title}**\n\n${summary}`;
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: body
+            });
+
+      - name: Fail on regression
+        if: steps.perf-check.outcome == 'failure'
+        run: |
+          echo "::error::Performance regression detected. See logs for details."
+          exit 1
diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py
new file mode 100755
index 00000000..12c0eb2f
--- /dev/null
+++ b/tools/check_performance_regression.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+"""
+Performance regression checker for evmone benchmarks.
+
+Usage:
+    # Save baseline results
+    python check_performance_regression.py --save-baseline baseline.json
+
+    # Check for regressions against baseline
+    python check_performance_regression.py --baseline baseline.json
+
+    # Check with custom threshold (default 10%)
+    python check_performance_regression.py --baseline baseline.json --threshold 0.15
+
+Exit codes:
+    0 - No significant regression detected
+    1 - Performance regression detected (> threshold)
+    2 - Script error (execution failed, file not found, etc.)
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import tempfile
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+
+@dataclass
+class BenchmarkResult:
+    name: str
+    time_ns: float  # Time in nanoseconds
+    cpu_time_ns: float
+    iterations: int
+
+
+def run_benchmark(
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    extra_args: Optional[List[str]] = None,
+) -> List[BenchmarkResult]:
+    """Run benchmark and parse JSON output.
+
+    Uses --benchmark_out to write JSON results to a temporary file so that
+    the human-readable benchmark progress streams to stdout/stderr in real
+    time (important for CI visibility).
+    """
+    env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"}
+
+    # Write JSON results to a temp file instead of capturing stdout.
+    # This lets Google Benchmark's normal console output (one line per
+    # completed case) stream directly to the CI log in real time.
+    fd, json_out_path = tempfile.mkstemp(suffix=".json")
+    os.close(fd)
+
+    cmd = [
+        "./build/bin/evmone-bench",
+        benchmark_dir,
+        "--benchmark_filter=external/*",
+        f"--benchmark_out={json_out_path}",
+        "--benchmark_out_format=json",
+    ]
+
+    if extra_args:
+        cmd.extend(extra_args)
+
+    print(f"Running: {' '.join(cmd)}")
+    print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}")
+    sys.stdout.flush()
+
+    result = subprocess.run(
+        cmd,
+        env={**os.environ, **env},
+    )
+
+    if result.returncode != 0:
+        print(f"Benchmark execution failed with code {result.returncode}")
+        # Clean up temp file on failure
+        try:
+            os.unlink(json_out_path)
+        except OSError:
+            pass
+        sys.exit(2)
+
+    # Read JSON results from the temp file
+    try:
+        with open(json_out_path, "r") as f:
+            json_data = f.read()
+    finally:
+        try:
+            os.unlink(json_out_path)
+        except OSError:
+            pass
+
+    return parse_benchmark_json(json_data)
+
+
+def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]:
+    """Parse Google Benchmark JSON output."""
+    try:
+        data = json.loads(json_output)
+    except json.JSONDecodeError as e:
+        print(f"Failed to parse JSON: {e}")
+        sys.exit(2)
+
+    results = []
+    for benchmark in data.get("benchmarks", []):
+        # Skip aggregates like mean, median, stddev
+        if benchmark.get("run_type") != "iteration":
+            continue
+
+        results.append(
+            BenchmarkResult(
+                name=benchmark["name"],
+                time_ns=benchmark.get("real_time", 0),
+                cpu_time_ns=benchmark.get("cpu_time", 0),
+                iterations=benchmark.get("iterations", 1),
+            )
+        )
+
+    return results
+
+
+def load_baseline(path: str) -> List[BenchmarkResult]:
+    """Load baseline results from JSON file."""
+    try:
+        with open(path, "r") as f:
+            data = json.load(f)
+    except FileNotFoundError:
print(f"::error::Baseline file not found: {path}") + sys.exit(2) + except json.JSONDecodeError as e: + print(f"::error::Failed to parse baseline JSON: {e}") + sys.exit(2) + + results = [] + for item in data: + results.append( + BenchmarkResult( + name=item["name"], + time_ns=item["time_ns"], + cpu_time_ns=item["cpu_time_ns"], + iterations=item["iterations"], + ) + ) + + return results + + +def save_baseline(results: List[BenchmarkResult], path: str) -> None: + """Save baseline results to JSON file.""" + data = [] + for r in results: + data.append({ + "name": r.name, + "time_ns": r.time_ns, + "cpu_time_ns": r.cpu_time_ns, + "iterations": r.iterations, + }) + + with open(path, "w") as f: + json.dump(data, f, indent=2) + + print(f"Saved {len(results)} benchmark results to {path}") + + +def compare_benchmarks( + current: List[BenchmarkResult], + baseline: List[BenchmarkResult], + threshold: float, +) -> Tuple[bool, List[dict]]: + """ + Compare current results against baseline. + + Returns: + (has_regression, comparison_details) + """ + baseline_map = {b.name: b for b in baseline} + current_map = {c.name: c for c in current} + + # Find missing and new benchmarks + baseline_names = set(baseline_map.keys()) + current_names = set(current_map.keys()) + + missing = baseline_names - current_names + new = current_names - baseline_names + + if missing: + print(f"::warning::Missing benchmarks (in baseline but not in current): {missing}") + if new: + print(f"::notice::New benchmarks (in current but not in baseline): {new}") + + # Compare common benchmarks + comparisons = [] + has_regression = False + + for name in sorted(baseline_names & current_names): + b = baseline_map[name] + c = current_map[name] + + # Calculate percentage change (positive = slower/regression) + time_change = (c.time_ns - b.time_ns) / b.time_ns + cpu_change = (c.cpu_time_ns - b.cpu_time_ns) / b.cpu_time_ns + + # Use the worse of real_time or cpu_time change + max_change = max(time_change, cpu_change) + + is_regression = max_change > threshold + if is_regression: + has_regression = True + + comparisons.append({ + "name": name, + "baseline_time_ns": b.time_ns, + "current_time_ns": c.time_ns, + "time_change": time_change, + "cpu_change": cpu_change, + "max_change": max_change, + "is_regression": is_regression, + }) + + return has_regression, comparisons + + +def print_comparison_table(comparisons: List[dict], threshold: float) -> None: + """Print a formatted comparison table.""" + if not comparisons: + print("No benchmarks to compare.") + return + + # GitHub Actions annotation messages + print("\n" + "=" * 100) + print(f"{'Benchmark':<60} {'Baseline(μs)':<15} {'Current(μs)':<15} {'Change':<12} {'Status'}") + print("=" * 100) + + regression_count = 0 + for comp in comparisons: + name = comp["name"] + baseline_us = comp["baseline_time_ns"] / 1000 + current_us = comp["current_time_ns"] / 1000 + change_pct = comp["max_change"] * 100 + status = "✓ PASS" if not comp["is_regression"] else "✗ FAIL" + + # Truncate long names + display_name = name if len(name) < 60 else name[:57] + "..." 
+
+        print(f"{display_name:<60} {baseline_us:<15.2f} {current_us:<15.2f} {change_pct:>+10.1f}% {status}")
+
+        if comp["is_regression"]:
+            regression_count += 1
+            # GitHub Actions warning annotation
+            print(f"::warning title=Performance Regression::{name} regressed by {change_pct:.1f}% (threshold: {threshold*100:.0f}%)")
+
+    print("=" * 100)
+    print(f"\nTotal benchmarks: {len(comparisons)}")
+    print(f"Regressions (> {threshold*100:.0f}%): {regression_count}")
+
+
+def _short_name(name: str) -> str:
+    """Extract a short display name from the full benchmark name.
+
+    Benchmark names typically look like 'external/some_case/variant'.
+    We strip the leading 'external/' prefix to keep the table compact.
+    """
+    if name.startswith("external/"):
+        return name[len("external/"):]
+    return name
+
+
+def generate_markdown_summary(
+    comparisons: List[dict],
+    threshold: float,
+    has_regression: bool,
+) -> str:
+    """Generate a concise Markdown summary of benchmark comparison results."""
+    lines: List[str] = []
+
+    regression_count = sum(1 for c in comparisons if c["is_regression"])
+
+    lines.append(
+        f"**Performance Benchmark Results** (threshold: {threshold*100:.0f}%)"
+    )
+    lines.append("")
+
+    if not comparisons:
+        lines.append("_No benchmarks to compare._")
+        return "\n".join(lines)
+
+    # Markdown table header
+    lines.append("| Benchmark | Baseline (us) | Current (us) | Change | Status |")
+    lines.append("|-----------|--------------|-------------|--------|--------|")
+
+    for comp in comparisons:
+        name = _short_name(comp["name"])
+        baseline_us = comp["baseline_time_ns"] / 1000
+        current_us = comp["current_time_ns"] / 1000
+        change_pct = comp["max_change"] * 100
+        status = "PASS" if not comp["is_regression"] else "**REGRESSED**"
+
+        lines.append(
+            f"| {name} | {baseline_us:.2f} | {current_us:.2f} "
+            f"| {change_pct:+.1f}% | {status} |"
+        )
+
+    lines.append("")
+    lines.append(
+        f"**Summary**: {len(comparisons)} benchmarks, "
+        f"{regression_count} regressions"
+    )
+
+    return "\n".join(lines)
+
+
+def generate_baseline_summary(results: List[BenchmarkResult]) -> str:
+    """Generate a concise Markdown summary for a baseline-save run."""
+    lines: List[str] = []
+    lines.append("**Baseline Benchmark Results**")
+    lines.append("")
+    lines.append("| Benchmark | Time (us) |")
+    lines.append("|-----------|----------|")
+
+    for r in results:
+        name = _short_name(r.name)
+        time_us = r.time_ns / 1000
+        lines.append(f"| {name} | {time_us:.2f} |")
+
+    lines.append("")
+    lines.append(f"**Total**: {len(results)} benchmarks collected")
+    return "\n".join(lines)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Check for performance regressions in evmone benchmarks",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Save baseline after a known-good commit
+    python check_performance_regression.py --save-baseline baseline.json
+
+    # Check current commit against baseline in CI
+    python check_performance_regression.py --baseline baseline.json
+
+    # Check with custom threshold (15% instead of default 10%)
+    python check_performance_regression.py --baseline baseline.json --threshold 0.15
+
+    # Specify different library or benchmark directory
+    python check_performance_regression.py --baseline baseline.json --lib ./other.so --mode jit
+""",
+    )
+
+    parser.add_argument(
+        "--baseline",
+        metavar="PATH",
+        help="Path to baseline JSON file for comparison",
+    )
+    parser.add_argument(
+        "--save-baseline",
+        metavar="PATH",
+        help="Run benchmarks and save results to file (use this to create baseline)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.10,
+        help="Regression threshold as ratio (default: 0.10 = 10%%)",
+    )
+    parser.add_argument(
+        "--lib",
+        default="./libdtvmapi.so",
+        help="Path to the library to benchmark (default: ./libdtvmapi.so)",
+    )
+    parser.add_argument(
+        "--mode",
+        default="interpreter",
+        help="Mode for the library (default: interpreter)",
+    )
+    parser.add_argument(
+        "--benchmark-dir",
+        default="test/evm-benchmarks/benchmarks",
+        help="Path to benchmark directory (default: test/evm-benchmarks/benchmarks)",
+    )
+    parser.add_argument(
+        "--output-summary",
+        metavar="PATH",
+        help="Write a concise Markdown summary to the given file (for PR comments)",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Verbose output",
+    )
+
+    args = parser.parse_args()
+
+    if not args.baseline and not args.save_baseline:
+        parser.error("Either --baseline or --save-baseline must be specified")
+
+    # Run benchmarks
+    try:
+        current_results = run_benchmark(
+            lib_path=args.lib,
+            mode=args.mode,
+            benchmark_dir=args.benchmark_dir,
+        )
+    except Exception as e:
+        print(f"::error::Failed to run benchmarks: {e}")
+        sys.exit(2)
+
+    if not current_results:
+        print("::error::No benchmark results found")
+        sys.exit(2)
+
+    print(f"Collected {len(current_results)} benchmark results")
+
+    # Save baseline mode
+    if args.save_baseline:
+        save_baseline(current_results, args.save_baseline)
+        if args.output_summary:
+            summary_md = generate_baseline_summary(current_results)
+            with open(args.output_summary, "w") as f:
+                f.write(summary_md)
+            print(f"Wrote baseline summary to {args.output_summary}")
+        return 0
+
+    # Compare mode
+    baseline_results = load_baseline(args.baseline)
+    print(f"Loaded {len(baseline_results)} baseline results from {args.baseline}")
+
+    has_regression, comparisons = compare_benchmarks(
+        current_results,
+        baseline_results,
+        args.threshold,
+    )
+
+    print_comparison_table(comparisons, args.threshold)
+
+    # Write Markdown summary for PR comments
+    if args.output_summary:
+        summary_md = generate_markdown_summary(
+            comparisons, args.threshold, has_regression
+        )
+        with open(args.output_summary, "w") as f:
+            f.write(summary_md)
+        print(f"Wrote comparison summary to {args.output_summary}")
+
+    # Summary for GitHub Actions
+    print("\n" + "=" * 100)
+    if has_regression:
+        print(f"::error::Performance regression detected! Some benchmarks exceeded {args.threshold*100:.0f}% threshold.")
+        print("RESULT: FAIL")
+        return 1
+    else:
+        print("::notice::No significant performance regression detected.")
+        print("RESULT: PASS")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())