Skip to content
17 changes: 17 additions & 0 deletions .codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
coverage:
  status:
    project:
      default:
        target: 95%
        threshold: 1%
    patch:
      default:
        target: 95%

ignore:
  - "ext/**/*"

comment:
  layout: "reach,diff,flags,files"
  behavior: default
  require_changes: true
88 changes: 50 additions & 38 deletions docs/src/architecture/macro-internals.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,50 +115,56 @@ end
# If only checkpoint!(pool, Int64), Float64 arrays won't be rewound!
```

### The Solution: `_untracked_flags`
### The Solution: Bitmask-Based Untracked Tracking

Every `acquire!` call (and convenience functions) marks itself as "untracked":
Every `acquire!` call (and convenience functions) marks itself as "untracked" with type-specific bitmask information:

```julia
# Public API (called from user code outside macro)
@inline function acquire!(pool, ::Type{T}, n::Int) where {T}
_mark_untracked!(pool) # ← Sets flag!
_mark_untracked!(pool, T) # ← Sets type-specific bitmask!
_acquire_impl!(pool, T, n)
end

# Macro-transformed calls skip the marking
# (because macro already knows about them)
_acquire_impl!(pool, T, n) # ← No flag
_acquire_impl!(pool, T, n) # ← No marking
```

Each fixed-slot type maps to a bit in a `UInt16` bitmask via `_fixed_slot_bit(T)`.
Non-fixed-slot types set a separate `_untracked_has_others` flag.

### Flow Diagram

```
@with_pool pool begin State of pool._untracked_flags
│ ─────────────────────────────────
├─► checkpoint!(pool, Int64) depth=2, flag[2]=false
@with_pool pool begin Bitmask state at depth 2
│ ─────────────────────────────
├─► checkpoint!(pool, Int64) masks[2]=0x0000, others[2]=false
│ A = _acquire_impl!(...) (macro-transformed, no flag set)
│ A = _acquire_impl!(...) (macro-transformed, no mark)
│ B = helper!(pool)
│ └─► zeros!(pool, Float64, N)
│ └─► _mark_untracked!(pool) flag[2]=TRUE ←──┐
... more code ...
└─► rewind! check:
if pool._untracked_flags[2] ─────────────────────────┘
rewind!(pool) # Full rewind (safe)
else
│ └─► _mark_untracked!(pool, Float64)
masks[2] |= 0x0001 (Float64 bit) ←───┐
... more code ...
└─► rewind! check: │
tracked_mask = _tracked_mask_for_types(Int64)
if _can_use_typed_path(pool, tracked_mask) ────────┘
rewind!(pool, Int64) # Typed rewind (fast)
else # Float64 not in {Int64} → full
rewind!(pool) # Full rewind (safe)
end
end
```

### Why This Works

1. **Macro-tracked calls**: Transformed to `_acquire_impl!` → no flag → typed rewind
2. **Untracked calls**: Use public API → sets flag → triggers full rewind
3. **Result**: Always safe, with optimization when possible
1. **Macro-tracked calls**: Transformed to `_acquire_impl!` → no bitmask mark → typed path
2. **Untracked calls**: Use public API → sets type-specific bitmask → subset check at rewind
3. **Subset optimization**: If untracked types are a subset of tracked types, the typed path is still safe
4. **Result**: Always safe, with finer-grained optimization than a single boolean flag

## Nested `@with_pool` Handling

Expand All @@ -170,27 +176,29 @@ Each `@with_pool` maintains its own checkpoint depth:
├─► @with_pool p2 begin depth: 2 → 3
│ v2 = acquire!(p2, Int64, 5)
│ helper!(p2) # sets flag[3]=true
│ helper!(p2) # marks bitmask at depth 3
│ sum(v2)
│ end depth: 3 → 2, flag[3] checked
│ end depth: 3 → 2, bitmask checked
│ # v1 still valid here!
sum(v1)
end depth: 2 → 1, flag[2] checked
end depth: 2 → 1, bitmask checked
```

### Depth Tracking Data Structures

```julia
mutable struct AdaptiveArrayPool
# ... type pools ...
_current_depth::Int # Current scope depth (1 = global)
_untracked_flags::Vector{Bool} # Per-depth flag array
_current_depth::Int # Current scope depth (1 = global)
_untracked_fixed_masks::Vector{UInt16} # Per-depth: which fixed slots untracked
_untracked_has_others::Vector{Bool} # Per-depth: any non-fixed-slot untracked
end

# Initialized with sentinel:
_current_depth = 1 # Global scope
_untracked_flags = [false] # Sentinel for depth=1
_current_depth = 1 # Global scope
_untracked_fixed_masks = [UInt16(0)] # Sentinel for depth=1
_untracked_has_others = [false] # Sentinel for depth=1
```

## Performance Impact
Expand All @@ -199,9 +207,12 @@ _untracked_flags = [false] # Sentinel for depth=1
|----------|-------------------|----------------|
| 1 type, no untracked | `checkpoint!(pool, T)` | **~77% faster** |
| Multiple types, no untracked | `checkpoint!(pool, T1, T2, ...)` | **~50% faster** |
| Any untracked acquire | `checkpoint!(pool)` | Baseline |
| Untracked subset of tracked | `checkpoint!(pool, T...)` | **~77% faster** |
| Unknown untracked types | `checkpoint!(pool)` | Baseline |

The optimization matters most in tight loops with many iterations.
The optimization matters most in tight loops with many iterations. The bitmask subset
check allows the typed path even when untracked acquires occur, as long as those types
are already covered by the macro's tracked set.

## Code Generation Summary

Expand All @@ -217,23 +228,22 @@ end
function compute(data)
pool = get_task_local_pool()

# Check if parent scope had untracked (for nested pools)
if pool._untracked_flags[pool._current_depth]
checkpoint!(pool) # Full checkpoint
# Bitmask subset check: can typed path handle any untracked acquires?
if _can_use_typed_path(pool, _tracked_mask_for_types(Float64))
checkpoint!(pool, Float64) # Typed checkpoint (fast)
else
checkpoint!(pool, Float64) # Typed checkpoint
checkpoint!(pool) # Full checkpoint (safe)
end

try
A = _acquire_impl!(pool, Float64, length(data))
result = helper!(pool, A)
return result
finally
# Check if untracked acquires occurred in this scope
if pool._untracked_flags[pool._current_depth]
rewind!(pool) # Full rewind
if _can_use_typed_path(pool, _tracked_mask_for_types(Float64))
rewind!(pool, Float64) # Typed rewind (fast)
else
rewind!(pool, Float64) # Typed rewind
rewind!(pool) # Full rewind (safe)
end
end
end
Expand All @@ -246,8 +256,10 @@ end
| `_extract_acquire_types(expr, pool_name)` | AST walk to find types |
| `_filter_static_types(types, local_vars)` | Filter out locally-defined types |
| `_transform_acquire_calls(expr, pool_name)` | Replace `acquire!` → `_acquire_impl!` |
| `_mark_untracked!(pool)` | Set untracked flag for current depth |
| `_generate_typed_checkpoint_call(pool, types)` | Generate `checkpoint!(pool, T...)` |
| `_mark_untracked!(pool, T)` | Set type-specific bitmask for current depth |
| `_can_use_typed_path(pool, mask)` | Bitmask subset check for typed vs full path |
| `_tracked_mask_for_types(T...)` | Compile-time bitmask for tracked types |
| `_generate_typed_checkpoint_call(pool, types)` | Generate bitmask-aware checkpoint |

## See Also

Expand Down
58 changes: 42 additions & 16 deletions ext/AdaptiveArrayPoolsCUDAExt/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ end
# ==============================================================================

function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool)
# Increment depth and initialize untracked flag
# Increment depth and initialize untracked bitmask state
pool._current_depth += 1
push!(pool._untracked_flags, false)
push!(pool._untracked_fixed_masks, UInt16(0))
push!(pool._untracked_has_others, false)
depth = pool._current_depth

# Fixed slots - zero allocation via @generated iteration
Expand All @@ -52,17 +53,27 @@ end
# Type-specific checkpoint (single type).
#
# Opens a new scope level on `pool` and checkpoints only the typed pool for `T`:
#   1. increments `pool._current_depth`,
#   2. pushes a cleared entry for the new depth onto the per-depth untracked-state
#      vectors,
#   3. calls `_checkpoint_typed_pool!` on the typed pool obtained via
#      `get_typed_pool!(pool, T)`, passing the new depth.
# Always returns `nothing`.
#
# NOTE(review): this text shows pushes to `_untracked_flags` as well as to
# `_untracked_fixed_masks` / `_untracked_has_others`. It looks like a rendered diff
# in which the `_untracked_flags` push is the removed line — confirm against the
# real file. Whatever is pushed here must be mirrored by the pops in the matching
# typed `rewind!`.
@inline function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T}
# Enter the new scope level before recording any per-depth state.
pool._current_depth += 1
push!(pool._untracked_flags, false)
# Fresh per-depth state: empty UInt16 mask and `false` for the "others" flag.
push!(pool._untracked_fixed_masks, UInt16(0))
push!(pool._untracked_has_others, false)
# Only the pool for `T` is checkpointed; other typed pools are not touched here.
_checkpoint_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth)
nothing
end

# Type-specific checkpoint (multiple types)
@generated function AdaptiveArrayPools.checkpoint!(pool::CuAdaptiveArrayPool, types::Type...)
checkpoint_exprs = [:(_checkpoint_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in 1:length(types)]
seen = Set{Any}()
unique_indices = Int[]
for i in eachindex(types)
if !(types[i] in seen)
push!(seen, types[i])
push!(unique_indices, i)
end
end
checkpoint_exprs = [:(_checkpoint_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in unique_indices]
quote
pool._current_depth += 1
push!(pool._untracked_flags, false)
push!(pool._untracked_fixed_masks, UInt16(0))
push!(pool._untracked_has_others, false)
$(checkpoint_exprs...)
nothing
end
Expand Down Expand Up @@ -91,7 +102,8 @@ function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool)
_rewind_typed_pool!(tp, cur_depth)
end

pop!(pool._untracked_flags)
pop!(pool._untracked_fixed_masks)
pop!(pool._untracked_has_others)
pool._current_depth -= 1

return nothing
Expand All @@ -104,22 +116,32 @@ end
return nothing
end
_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, T), pool._current_depth)
pop!(pool._untracked_flags)
pop!(pool._untracked_fixed_masks)
pop!(pool._untracked_has_others)
pool._current_depth -= 1
nothing
end

# Type-specific rewind (multiple types)
@generated function AdaptiveArrayPools.rewind!(pool::CuAdaptiveArrayPool, types::Type...)
rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in length(types):-1:1]
reset_exprs = [:(reset!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]))) for i in 1:length(types)]
seen = Set{Any}()
unique_indices = Int[]
for i in eachindex(types)
if !(types[i] in seen)
push!(seen, types[i])
push!(unique_indices, i)
end
end
rewind_exprs = [:(_rewind_typed_pool!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]), pool._current_depth)) for i in reverse(unique_indices)]
reset_exprs = [:(reset!(AdaptiveArrayPools.get_typed_pool!(pool, types[$i]))) for i in unique_indices]
quote
if pool._current_depth == 1
$(reset_exprs...)
return nothing
end
$(rewind_exprs...)
pop!(pool._untracked_flags)
pop!(pool._untracked_fixed_masks)
pop!(pool._untracked_has_others)
pool._current_depth -= 1
nothing
end
Expand All @@ -140,10 +162,12 @@ function AdaptiveArrayPools.reset!(pool::CuAdaptiveArrayPool)
reset!(tp)
end

# Reset untracked detection state
# Reset depth and bitmask sentinel state
pool._current_depth = 1
empty!(pool._untracked_flags)
push!(pool._untracked_flags, false)
empty!(pool._untracked_fixed_masks)
push!(pool._untracked_fixed_masks, UInt16(0)) # Sentinel: no bits set
empty!(pool._untracked_has_others)
push!(pool._untracked_has_others, false) # Sentinel: no others

return pool
end
Expand Down Expand Up @@ -197,10 +221,12 @@ function Base.empty!(pool::CuAdaptiveArrayPool)
end
empty!(pool.others)

# Reset state
# Reset depth and bitmask sentinel state
pool._current_depth = 1
empty!(pool._untracked_flags)
push!(pool._untracked_flags, false)
empty!(pool._untracked_fixed_masks)
push!(pool._untracked_fixed_masks, UInt16(0)) # Sentinel: no bits set
empty!(pool._untracked_has_others)
push!(pool._untracked_has_others, false) # Sentinel: no others

return pool
end
6 changes: 4 additions & 2 deletions ext/AdaptiveArrayPoolsCUDAExt/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ mutable struct CuAdaptiveArrayPool <: AbstractArrayPool

# State management (same as CPU)
_current_depth::Int
_untracked_flags::Vector{Bool}
_untracked_fixed_masks::Vector{UInt16} # Per-depth: which fixed slots had untracked acquires
_untracked_has_others::Vector{Bool} # Per-depth: any non-fixed-slot untracked acquire?

# Device tracking (safety)
device_id::Int
Expand All @@ -131,7 +132,8 @@ function CuAdaptiveArrayPool()
CuTypedPool{Bool}(),
IdDict{DataType, Any}(),
1, # _current_depth (1 = global scope)
[false], # _untracked_flags sentinel
[UInt16(0)], # _untracked_fixed_masks: sentinel (no bits set)
[false], # _untracked_has_others: sentinel (no others)
CUDA.deviceid(dev) # Use public API
)
end
Loading