From bd552b62732954819ef636ee15d4ce3d66bd6fed Mon Sep 17 00:00:00 2001 From: cl507523 Date: Tue, 10 Feb 2026 11:27:13 +0000 Subject: [PATCH 1/2] perf(evm): improve shift RA in EVM JIT --- docs/jit-shift-compilation-analysis.md | 226 ++++++++++++++++++ .../evm_frontend/evm_mir_compiler.cpp | 29 ++- src/compiler/evm_frontend/evm_mir_compiler.h | 25 +- 3 files changed, 271 insertions(+), 9 deletions(-) create mode 100644 docs/jit-shift-compilation-analysis.md diff --git a/docs/jit-shift-compilation-analysis.md b/docs/jit-shift-compilation-analysis.md new file mode 100644 index 00000000..3e13de86 --- /dev/null +++ b/docs/jit-shift-compilation-analysis.md @@ -0,0 +1,226 @@ +# JIT Compilation Performance Analysis: RA-Expensive Opcodes + +## Problem Summary + +EVM shift opcodes (SHL/SHR/SAR) generate long `SelectInstruction` chains in MIR +(~100-190 MIR instructions per shift opcode, versus an estimate weight of 15). +When hundreds or thousands of shift +operations appear in a single EVM function, the greedy register allocator's +complexity becomes superlinear (approaching O(n^2)), causing compilation times to +explode from milliseconds to minutes. + +## Root Cause + +Each shift opcode expands to a U256 shift implemented as 4 x i64 component +shifts with cross-component carry propagation. This generates per-component +`SelectInstruction` chains: + +``` +SrcValue = Select(IsMatch, Value[0], SrcValue) +SrcValue = Select(IsMatch, Value[1], SrcValue) +SrcValue = Select(IsMatch, Value[2], SrcValue) +SrcValue = Select(IsMatch, Value[3], SrcValue) +SrcValue = Select(IsInBounds, SrcValue, Zero) +// ... similar chain for CarryValue ... +``` + +The register allocator (greedy RA) struggles when: +1. Many such chains exist in a single basic block +2. Virtual register live ranges overlap extensively +3. 
Eviction/splitting cascades compound the cost + +## Two Distinct Patterns + +### Pattern b0: min_stack (DUP-same-operand) + +**Bytecode**: `DUP1 SHL DUP1 SHL DUP1 SHL ...` (interleaved) + +**Key characteristic**: Each SHL's two operands are identical (`Shift == Value`) +because DUP1 duplicates the top-of-stack, and SHL then pops both the original +value and its duplicate. + +**Root cause**: The shift result feeds back as BOTH operands of the next shift +via DUP. This creates a serial feedback loop where each Select chain's live +ranges overlap with all subsequent chains. The same value cycles through +`handleShift` repeatedly, so the set of overlapping live ranges keeps growing +with every additional shift. + +**Compilation time** (Release, codeSize=2087, ~1023 SHL ops): +- Without fix: ~78 seconds +- With fix (protectUnsafeValue on intermediates): **~6 seconds** (13x improvement) + +### Pattern b1: full_stack (DUP-then-shift) + +**Bytecode**: `DUP1 x1023` then `SHL x1022` then `POP` (batched) + +**Key characteristic**: All 1023 DUPs push the SAME `counter` value onto the +stack. Each SHL consumes the previous SHL result (top) and an original `counter` +copy (second). So `Shift != Value` for all SHLs after the first. + +**Root cause**: `counter[0..3]` (its 4 `MInstruction*` components) are each used by ~1022 +different SHL calls spread across the entire function. Their live ranges span the +entire function, creating massive interference with all Select chain +intermediates. The problem is fundamentally about **large fan-out** of a single +value, not about dependency chains. 
+ +**Compilation time** (Release, codeSize=2087, ~1023 SHL ops): +- Without any fix: ~57-132 seconds (varies by opcode) +- Input-level protectUnsafeValue: ~67-145 seconds (no improvement, sometimes worse) + +## Implemented Fix: DUP Pattern Detection (b0) + +**Location**: `src/compiler/evm_frontend/evm_mir_compiler.h` (`handleShift`) +and `src/compiler/evm_frontend/evm_mir_compiler.cpp` (handleLeftShift, +handleLogicalRightShift, handleArithmeticRightShift) + +**Detection**: In `handleShift`, after `extractU256Operand`: +```cpp +bool BreakLiveRanges = (Shift == Value); +``` +`std::array::operator==` compares all 4 `MInstruction*` pointers. When both +operands come from the same DUP'd stack value, the pointers are identical. + +**Mitigation**: When `BreakLiveRanges == true`, insert `protectUnsafeValue` +(Dassign + Dread pair) after the Select chain outputs for `SrcValue` and +`CarryValue` inside each handler. This forces a spill/reload that breaks the +long live ranges of the Select chain outputs, preventing the RA from building +up massive interference graphs. + +**Result**: b0 compilation reduced from ~78s to ~6s with no b1 regression. + +## Unresolved: b1 Pattern + +### Why protectUnsafeValue doesn't help b1 + +**Intermediate protection** (SrcValue/CarryValue after Select chains): +Adds extra VRs inside the Select chain, extending chains and making RA worse. +Result: b1 regressed from ~132s to ~151s. + +**Input protection** (Value components before Select chains): +Creates fresh copies via Dassign/Dread, but `counter[i]` is still USED by +~1022 Dassign instructions. Its live range still spans the entire function. +The RA complexity is dominated by the sheer VR count (~19000) in a single BB, +not just live range lengths. Result: mixed, no consistent improvement. + +### Potential Solutions (not yet implemented) + +1. 
**Non-linear MIR estimate penalty**: When RA-expensive opcodes (SHL, SHR, + SAR, MUL, SIGNEXTEND, BYTE) exceed a count threshold (e.g., 64), add a + quadratic penalty to the MIR estimate. This pushes extreme patterns past + `MAX_JIT_MIR_ESTIMATE` while leaving normal contracts unaffected. + +2. **RA budget/timeout**: Add a compilation time or iteration budget to the + greedy RA. If exceeded, bail out and fallback to interpreter. This handles + ALL pathological patterns regardless of opcode type. + +3. **Function splitting**: Break the single large basic block into smaller + functions or compilation units at the MIR level, reducing per-unit RA cost. + +4. **DUP-level optimization**: In `handleDup`, when the same value has been + duplicated many times (e.g., >16), insert `protectUnsafeValue` to create + fresh copies. This wouldn't help b1's counter fan-out but might help + intermediate patterns. + +5. **Linear-scan RA for large functions**: Switch to a simpler O(n) register + allocator when the MIR instruction count exceeds a threshold. + +### Practical Consideration + +The b1 pattern (1023 consecutive DUPs followed by 1022 consecutive SHLs) is a +**synthetic benchmark** pattern. Real EVM contracts are unlikely to have such +extreme opcode concentration. The b0 pattern (interleaved DUP+SHL) is somewhat +more realistic and is already handled by the DUP detection fix. + +## Benchmark Evidence + +All measurements on Release build, codeSize=2087, mirEstimate=19485: + +| Case | Pattern | No fix | DUP detect (current) | +|---------|---------|--------|---------------------| +| SHL/b0 | DUP | ~78s | **6.0s** | +| SHL/b1 | full | ~132s | 132s (unchanged) | +| SHR/b0 | DUP | ~78s* | **5.3s** | +| SHR/b1 | full | ~114s | 114s (unchanged) | +| SAR/b0 | DUP | ~78s* | **2.8s** | +| SAR/b1 | full | ~57s | 57s (unchanged) | + +*Estimated from SHL/b0 baseline; exact measurements for SHR/SAR b0 without fix +were not captured separately. 
+ +## All RA-Expensive Opcodes Analysis + +Beyond shift opcodes, other handlers also generate Select chains or heavy MIR +that could cause similar RA slowdowns at high density. + +### Select Chain Density per Handler + +| Handler | Select/call | Total MIR/call | Opcode | Weight | Risk | +|---------|-------------|----------------|--------|--------|------| +| handleLogicalRightShift | **96** | ~160-190 | SHR (0x1c) | 15 | **High** | +| handleLeftShift | **92** | ~150-180 | SHL (0x1b) | 15 | **High** | +| handleArithmeticRightShift | **52** | ~100-130 | SAR (0x1d) | 15 | **High** | +| handleSignextend | **21** | ~80-100 | SIGNEXTEND (0x0b) | 20 | **Medium** | +| handleExp (computeExpByteSize) | 7 | ~25-30 | EXP (0x0a) | 5 | Low | +| handleByte | 4 | ~25-35 | BYTE (0x1a) | 8 | Low | +| handleCompareGT_LT | 3 | ~25-30 | GT/LT/SGT/SLT | 12 | Low | +| handleMul | **0** | ~50-60 | MUL (0x02) | 80 | **Special** | + +### Key Observations + +1. **SHL/SHR/SAR (High risk)**: 52-96 Selects per call with nested dependency + chains (J loop + K loop over 4 components). The b0 DUP pattern is handled + by the implemented fix. Weight of 15 severely underestimates actual MIR + output (~150-190 instructions). + +2. **SIGNEXTEND (Medium risk)**: 21 Selects per call with two dependency chain + loops (SignBit chain + result component chain). Already has + `protectUnsafeValue` on result components, which partially mitigates the + issue. Could still be problematic with 500+ consecutive SIGNEXTEND ops. + Weight of 20 underestimates actual MIR (~80-100). + +3. **MUL (Special case)**: Zero Select chains, but generates heavy inline U256 + multiplication (~50-60 MIR via partial products, EvmUmul128, carry + propagation). The original `synth/MUL/b0` hanging case proved that **large + intermediate value fan-out causes RA explosion even without Select chains**. + Weight of 80 is the most accurate relative to actual MIR count. + +4. 
**BYTE, Compare, EXP (Low risk)**: Few Selects per call, unlikely to cause + issues even at moderate density. + +### Weight Accuracy + +| Opcode | Current Weight | Actual MIR/call | Ratio (actual/weight) | +|--------|---------------|-----------------|----------------------| +| SHL | 15 | ~150-180 | **10-12x** underestimated | +| SHR | 15 | ~160-190 | **10-13x** underestimated | +| SAR | 15 | ~100-130 | **7-9x** underestimated | +| SIGNEXTEND | 20 | ~80-100 | **4-5x** underestimated | +| MUL | 80 | ~50-60 | ~0.7x (slightly overestimated) | +| BYTE | 8 | ~25-35 | ~3-4x underestimated | + +Note: Weight underestimation alone doesn't cause problems — the RA cost is +superlinear, so the real issue is **opcode density** (hundreds of the same +expensive opcode in one function), not individual weight inaccuracy. + +### Generalizable Fix: DUP Detection + +The `Shift == Value` check (an element-wise comparison of the four +`MInstruction*` pointers held in each operand's `std::array`) +can be generalized to any binary operation handler. When `OpA == OpB`, it means +both operands come from the same DUP'd stack value, creating a feedback loop +where the result cycles back as both inputs. This pattern is the primary cause +of RA explosion in the b0 (min_stack) benchmark variant. + +Candidates for generalization (if needed): +- `handleBinaryArithmetic` — already the most expensive; DUP pattern + would compound the cost +- `handleSignextend` — medium Select density, DUP pattern possible +- `handleBitwiseOp` — low individual cost but DUP pattern could amplify + +## Current State + +- **DUP pattern detection**: Implemented and verified for shift opcodes. + Handles b0 effectively (78s → 6s). +- **MIR weight**: SHL/SHR/SAR kept at 15 (linear estimate; underestimates + actual MIR by ~10x but weight accuracy is not the core issue). +- **MAX_JIT_MIR_ESTIMATE**: 50000 (b1's mirEstimate=19485 is below threshold). +- **b1 compilation**: Still slow (~57-132s) but completes; not addressed yet. 
+- **Other opcodes**: SIGNEXTEND has partial mitigation (existing + protectUnsafeValue). MUL is known problematic at high density. diff --git a/src/compiler/evm_frontend/evm_mir_compiler.cpp b/src/compiler/evm_frontend/evm_mir_compiler.cpp index cae05995..fea1f5c6 100644 --- a/src/compiler/evm_frontend/evm_mir_compiler.cpp +++ b/src/compiler/evm_frontend/evm_mir_compiler.cpp @@ -1776,7 +1776,8 @@ EVMMirBuilder::handleClz(const Operand &ValueOp) { EVMMirBuilder::U256Inst EVMMirBuilder::handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift) { + MInstruction *IsLargeShift, + bool BreakLiveRanges) { MType *MirI64Type = EVMFrontendContext::getMIRTypeFromEVMType(EVMType::UINT64); U256Inst Result = {}; @@ -1836,6 +1837,11 @@ EVMMirBuilder::handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, } SrcValue = createInstruction(false, MirI64Type, IsInBounds, SrcValue, Zero); + // When operands are identical (DUP pattern), break live ranges to prevent + // register allocation explosion from long dependency chains. 
+ if (BreakLiveRanges) { + SrcValue = protectUnsafeValue(SrcValue, MirI64Type); + } // Calculate previous component index for carry bits // prev_idx = src_idx - 1 @@ -1886,6 +1892,9 @@ EVMMirBuilder::handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, CarryValue = createInstruction( false, MirI64Type, IsMatch, CarryBits, CarryValue); } + if (BreakLiveRanges) { + CarryValue = protectUnsafeValue(CarryValue, MirI64Type); + } // Shift the source value left by the modulo amount // shifted_value = src_value << shift_mod @@ -1911,7 +1920,8 @@ EVMMirBuilder::handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, EVMMirBuilder::U256Inst EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift) { + MInstruction *IsLargeShift, + bool BreakLiveRanges) { MType *MirI64Type = EVMFrontendContext::getMIRTypeFromEVMType(EVMType::UINT64); U256Inst Result = {}; @@ -1968,6 +1978,9 @@ EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, } SrcValue = createInstruction(false, MirI64Type, IsInBounds, SrcValue, Zero); + if (BreakLiveRanges) { + SrcValue = protectUnsafeValue(SrcValue, MirI64Type); + } // Calculate next component index for carry bits // next_idx = src_idx + 1 @@ -2014,6 +2027,9 @@ EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, CarryValue = createInstruction( false, MirI64Type, IsMatch, CarryBits, CarryValue); } + if (BreakLiveRanges) { + CarryValue = protectUnsafeValue(CarryValue, MirI64Type); + } // Shift the source value right by the modulo amount // shifted_value = src_value >> shift_mod @@ -2039,7 +2055,8 @@ EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, EVMMirBuilder::U256Inst EVMMirBuilder::handleArithmeticRightShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift) { + MInstruction *IsLargeShift, + bool BreakLiveRanges) { MType *MirI64Type = EVMFrontendContext::getMIRTypeFromEVMType(EVMType::UINT64); U256Inst Result 
= {}; @@ -2103,6 +2120,9 @@ EVMMirBuilder::handleArithmeticRightShift(const U256Inst &Value, } SrcValue = createInstruction( false, MirI64Type, IsInBounds, SrcValue, LargeShiftResult); + if (BreakLiveRanges) { + SrcValue = protectUnsafeValue(SrcValue, MirI64Type); + } // Calculate next component index for carry bits // next_idx = src_idx + 1 @@ -2148,6 +2168,9 @@ EVMMirBuilder::handleArithmeticRightShift(const U256Inst &Value, false, OP_shl, MirI64Type, NextValue, CarryShift); MInstruction *CarryValue = createInstruction( false, MirI64Type, HasShift, CarryBits, Zero); + if (BreakLiveRanges) { + CarryValue = protectUnsafeValue(CarryValue, MirI64Type); + } // Use logical right shift; sign extension is handled via LargeShiftResult. MInstruction *ShiftedValue = createInstruction( diff --git a/src/compiler/evm_frontend/evm_mir_compiler.h b/src/compiler/evm_frontend/evm_mir_compiler.h index 1d65080f..9f4b4331 100644 --- a/src/compiler/evm_frontend/evm_mir_compiler.h +++ b/src/compiler/evm_frontend/evm_mir_compiler.h @@ -342,6 +342,13 @@ class EVMMirBuilder final { U256Inst Shift = extractU256Operand(ShiftOp); U256Inst Value = extractU256Operand(ValueOp); + // Detect DUP pattern: when both operands originate from the same stack + // value (e.g., DUP1 SHL), the MIR pointers are identical. In this pattern + // the shift result feeds back as both operands of the next shift, creating + // extremely long live ranges that cause register allocation to explode. + // Insert spill points (protectUnsafeValue) to break the chains. 
+ bool BreakLiveRanges = (Shift == Value); + // Check if shift amount >= 256 // (EVM spec: result is 0 for SHL/SHR, sign-extended for SAR) MInstruction *IsLargeShift = isU256GreaterOrEqual(Shift, 256); @@ -352,11 +359,14 @@ class EVMMirBuilder final { U256Inst Result = {}; if constexpr (Operator == BinaryOperator::BO_SHL) { - Result = handleLeftShift(Value, ShiftAmount, IsLargeShift); + Result = handleLeftShift(Value, ShiftAmount, IsLargeShift, + BreakLiveRanges); } else if constexpr (Operator == BinaryOperator::BO_SHR_U) { - Result = handleLogicalRightShift(Value, ShiftAmount, IsLargeShift); + Result = handleLogicalRightShift(Value, ShiftAmount, IsLargeShift, + BreakLiveRanges); } else if constexpr (Operator == BinaryOperator::BO_SHR_S) { - Result = handleArithmeticRightShift(Value, ShiftAmount, IsLargeShift); + Result = handleArithmeticRightShift(Value, ShiftAmount, IsLargeShift, + BreakLiveRanges); } return Operand(Result, EVMType::UINT256); @@ -559,15 +569,18 @@ class EVMMirBuilder final { CompareOperator Operator); U256Inst handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift); + MInstruction *IsLargeShift, + bool BreakLiveRanges = false); U256Inst handleLogicalRightShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift); + MInstruction *IsLargeShift, + bool BreakLiveRanges = false); U256Inst handleArithmeticRightShift(const U256Inst &Value, MInstruction *ShiftAmount, - MInstruction *IsLargeShift); + MInstruction *IsLargeShift, + bool BreakLiveRanges = false); // Helper functions for inline U256 multiplication MInstruction *createEvmUmul128(MInstruction *LHS, MInstruction *RHS); From 15cd9dd877ad11aaa628bfcaa7396f9bad3d7a9c Mon Sep 17 00:00:00 2001 From: cl507523 Date: Wed, 11 Feb 2026 06:50:23 +0000 Subject: [PATCH 2/2] fix: code format --- src/compiler/evm_frontend/evm_mir_compiler.cpp | 16 ++++++---------- src/compiler/evm_frontend/evm_mir_compiler.h | 4 ++-- 2 files changed, 8 
insertions(+), 12 deletions(-) diff --git a/src/compiler/evm_frontend/evm_mir_compiler.cpp b/src/compiler/evm_frontend/evm_mir_compiler.cpp index fea1f5c6..b00edc6b 100644 --- a/src/compiler/evm_frontend/evm_mir_compiler.cpp +++ b/src/compiler/evm_frontend/evm_mir_compiler.cpp @@ -1917,11 +1917,9 @@ EVMMirBuilder::handleLeftShift(const U256Inst &Value, MInstruction *ShiftAmount, return Result; } -EVMMirBuilder::U256Inst -EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, - MInstruction *ShiftAmount, - MInstruction *IsLargeShift, - bool BreakLiveRanges) { +EVMMirBuilder::U256Inst EVMMirBuilder::handleLogicalRightShift( + const U256Inst &Value, MInstruction *ShiftAmount, + MInstruction *IsLargeShift, bool BreakLiveRanges) { MType *MirI64Type = EVMFrontendContext::getMIRTypeFromEVMType(EVMType::UINT64); U256Inst Result = {}; @@ -2052,11 +2050,9 @@ EVMMirBuilder::handleLogicalRightShift(const U256Inst &Value, return Result; } -EVMMirBuilder::U256Inst -EVMMirBuilder::handleArithmeticRightShift(const U256Inst &Value, - MInstruction *ShiftAmount, - MInstruction *IsLargeShift, - bool BreakLiveRanges) { +EVMMirBuilder::U256Inst EVMMirBuilder::handleArithmeticRightShift( + const U256Inst &Value, MInstruction *ShiftAmount, + MInstruction *IsLargeShift, bool BreakLiveRanges) { MType *MirI64Type = EVMFrontendContext::getMIRTypeFromEVMType(EVMType::UINT64); U256Inst Result = {}; diff --git a/src/compiler/evm_frontend/evm_mir_compiler.h b/src/compiler/evm_frontend/evm_mir_compiler.h index 9f4b4331..057a5bd6 100644 --- a/src/compiler/evm_frontend/evm_mir_compiler.h +++ b/src/compiler/evm_frontend/evm_mir_compiler.h @@ -359,8 +359,8 @@ class EVMMirBuilder final { U256Inst Result = {}; if constexpr (Operator == BinaryOperator::BO_SHL) { - Result = handleLeftShift(Value, ShiftAmount, IsLargeShift, - BreakLiveRanges); + Result = + handleLeftShift(Value, ShiftAmount, IsLargeShift, BreakLiveRanges); } else if constexpr (Operator == BinaryOperator::BO_SHR_U) { Result = 
handleLogicalRightShift(Value, ShiftAmount, IsLargeShift, BreakLiveRanges);