Skip to content
Merged
46 changes: 45 additions & 1 deletion ext/AdaptiveArrayPoolsCUDAExt/acquire.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
# - Could track recent N sizes to make smarter decisions (avoid shrink if sizes fluctuate)
# ==============================================================================

using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod
using AdaptiveArrayPools: get_view!, get_nd_view!, get_nd_array!, allocate_vector, safe_prod,
_mark_untracked!, _fixed_slot_bit, _checkpoint_typed_pool!

"""
get_view!(tp::CuTypedPool{T}, n::Int) -> CuVector{T}
Expand Down Expand Up @@ -162,3 +163,46 @@ Used by `unsafe_acquire!` - same zero-allocation behavior as `acquire!`.
# GPU pools expose no dense-Array fast path: N-d acquisition simply delegates to
# the view-based `get_view!`, which handles the `dims` tuple directly.
@inline AdaptiveArrayPools.get_nd_array!(tp::CuTypedPool{T}, dims::NTuple{N, Int}) where {T, N} =
    get_view!(tp, dims)

# ==============================================================================
# CUDA _mark_untracked! override (Issue #2 / #2a fix)
# ==============================================================================
# Record that type `T` was acquired *untracked* at the current scope depth.
#
# In the lazy modes (bit 15 = dynamic-selective, bit 14 = typed-lazy) the FIRST
# untracked touch of a fixed-slot type also checkpoints its typed pool: at that
# moment `n_active` still holds the parent scope's count, so the later rewind
# restores the parent state (Case A fires rather than Case B).
#
# Float16 is special on CUDA: it is a direct struct field, yet
# `_fixed_slot_bit(Float16) == 0`, so it is tracked through bit 7 instead
# (free on GPU; the CPU uses bit 7 for the Bit element type, absent on GPU).
@inline function AdaptiveArrayPools._mark_untracked!(pool::CuAdaptiveArrayPool, ::Type{T}) where {T}
    depth = pool._current_depth
    b = _fixed_slot_bit(T)  # UInt16(0) means "T has no dedicated fixed-slot bit"
    if b == UInt16(0)
        if T === Float16
            # Float16: CUDA direct field tracked via bit 7 (not in pool.others dict).
            b16 = UInt16(1) << 7
            current_mask = @inbounds pool._untracked_fixed_masks[depth]
            # Lazy first-touch checkpoint: bit 14 (typed lazy) OR bit 15 (dynamic),
            # and only when bit 7 is not yet set at this depth (first touch).
            # Guard: skip if already checkpointed at this depth (prevents double-push).
            if (current_mask & 0xC000) != 0 && (current_mask & b16) == 0
                if @inbounds(pool.float16._checkpoint_depths[end]) != depth
                    _checkpoint_typed_pool!(pool.float16, depth)
                end
            end
            # Setting bit 7 makes later touches at this depth no-ops above.
            @inbounds pool._untracked_fixed_masks[depth] = current_mask | b16
        else
            # Genuine others type (UInt8, Int8, etc.) — eagerly snapshotted at
            # scope entry, so only the rewind flag needs raising here.
            @inbounds pool._untracked_has_others[depth] = true
        end
    else
        current_mask = @inbounds pool._untracked_fixed_masks[depth]
        # Lazy first-touch checkpoint for fixed-slot types in bit 14/15 modes.
        # Guard: skip if already checkpointed at this depth (prevents double-push).
        if (current_mask & 0xC000) != 0 && (current_mask & b) == 0
            tp = AdaptiveArrayPools.get_typed_pool!(pool, T)
            if @inbounds(tp._checkpoint_depths[end]) != depth
                _checkpoint_typed_pool!(tp, depth)
            end
        end
        @inbounds pool._untracked_fixed_masks[depth] = current_mask | b
    end
    nothing
end
113 changes: 112 additions & 1 deletion ext/AdaptiveArrayPoolsCUDAExt/state.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# AbstractTypedPool, so they work for CuTypedPool automatically.

using AdaptiveArrayPools: checkpoint!, rewind!, reset!,
_checkpoint_typed_pool!, _rewind_typed_pool!
_checkpoint_typed_pool!, _rewind_typed_pool!, _has_bit

# ==============================================================================
# GPU Fixed Slot Iteration
Expand Down Expand Up @@ -147,6 +147,117 @@ end
end
end

# ==============================================================================
# Dynamic-Selective Mode for CuAdaptiveArrayPool (use_typed=false path)
# ==============================================================================
# Mirrors CPU _depth_only_checkpoint! / _dynamic_selective_rewind! in src/state.jl.
#
# Float16 on CUDA: direct struct field (not in pool.others dict), but _fixed_slot_bit(Float16)=0.
# We reassign Float16 to bit 7 (unused on CUDA; CPU uses bit 7 for Bit type which has no GPU equivalent).
# This gives Float16 the same lazy-first-touch checkpoint treatment as other fixed-slot types,
# avoiding the unsafe unconditional-rewind issue (Option B) and the has_others confusion.

# Bit 7 is repurposed on CUDA to flag Float16 (the CPU side uses bit 7 for the
# Bit element type, which has no GPU counterpart).
@inline _cuda_float16_bit() = UInt16(0x0080)

@inline function AdaptiveArrayPools._depth_only_checkpoint!(pool::CuAdaptiveArrayPool)
    # Enter a new scope in dynamic-selective mode: bit 15 of the per-depth mask
    # marks the depth as "dynamic" (mirrors CPU _depth_only_checkpoint!).
    new_depth = pool._current_depth + 1
    pool._current_depth = new_depth
    push!(pool._untracked_fixed_masks, UInt16(0x8000))
    push!(pool._untracked_has_others, false)
    # Pre-existing `others` pools get an eager count snapshot so Case A fires
    # correctly at rewind; pools first created inside the scope start at
    # n_active = 0 and are covered by the sentinel (Case B safe).
    if !isempty(pool.others)
        for typed in values(pool.others)
            _checkpoint_typed_pool!(typed, new_depth)
        end
        @inbounds pool._untracked_has_others[new_depth] = true
    end
    # Float16 needs no eager snapshot: bit 7 handles it lazily on first touch
    # inside _mark_untracked!.
    return nothing
end

@inline function AdaptiveArrayPools._dynamic_selective_rewind!(pool::CuAdaptiveArrayPool)
    # Leave a dynamic-selective scope: rewind exactly the typed pools that were
    # touched at this depth, as recorded lazily by _mark_untracked!.
    depth = pool._current_depth
    # Low byte of the per-depth mask = fixed-slot types touched at this depth.
    touched = @inbounds(pool._untracked_fixed_masks[depth]) & UInt16(0x00FF)
    _has_bit(touched, Float64)    && _rewind_typed_pool!(pool.float64, depth)
    _has_bit(touched, Float32)    && _rewind_typed_pool!(pool.float32, depth)
    _has_bit(touched, Int64)      && _rewind_typed_pool!(pool.int64, depth)
    _has_bit(touched, Int32)      && _rewind_typed_pool!(pool.int32, depth)
    _has_bit(touched, ComplexF64) && _rewind_typed_pool!(pool.complexf64, depth)
    _has_bit(touched, ComplexF32) && _rewind_typed_pool!(pool.complexf32, depth)
    _has_bit(touched, Bool)       && _rewind_typed_pool!(pool.bool, depth)
    # Float16 lives in bit 7 on CUDA; _fixed_slot_bit(Float16) == 0 means
    # _has_bit cannot be used here, so test the reassigned bit explicitly.
    if touched & _cuda_float16_bit() != UInt16(0)
        _rewind_typed_pool!(pool.float16, depth)
    end
    if @inbounds(pool._untracked_has_others[depth])
        for typed in values(pool.others)
            _rewind_typed_pool!(typed, depth)
        end
    end
    pop!(pool._untracked_fixed_masks)
    pop!(pool._untracked_has_others)
    pool._current_depth -= 1
    return nothing
end

# ==============================================================================
# Typed-Fallback Helpers for CuAdaptiveArrayPool (Phase 5 parity)
# ==============================================================================

# Typed checkpoint plus lazy extra-type tracking: bit 14 tells _mark_untracked!
# to checkpoint any additional fixed-slot type on its first untracked touch.
# Also eagerly snapshots pre-existing others entries (mirrors CPU fix, Issue #3).
@inline function AdaptiveArrayPools._typed_checkpoint_with_lazy!(pool::CuAdaptiveArrayPool, types::Type...)
    checkpoint!(pool, types...)
    depth = pool._current_depth
    @inbounds pool._untracked_fixed_masks[depth] |= UInt16(0x4000)  # set bit 14
    # Eagerly snapshot pre-existing others entries — same reasoning as
    # _depth_only_checkpoint!. Entries already checkpointed at this depth by
    # the checkpoint! call above (e.g. Float16 listed in types...) are skipped
    # so the same depth is never pushed twice.
    if !isempty(pool.others)
        for typed in values(pool.others)
            already_done = @inbounds(typed._checkpoint_depths[end]) == depth
            already_done || _checkpoint_typed_pool!(typed, depth)
        end
        @inbounds pool._untracked_has_others[depth] = true
    end
    # Float16 uses lazy first-touch via bit 7 in _mark_untracked! — no eager
    # checkpoint needed here.
    return nothing
end

# _typed_selective_rewind!: selective rewind of (tracked | untracked) mask.
# Uses direct field access with bit checks — foreach_fixed_slot is single-argument
# (no bit yield), so each fixed-slot field is tested explicitly.
# Bit 7: Float16 (CUDA-specific; lazy-checkpointed on first touch by _mark_untracked!).
# has_others: genuine others types (UInt8, Int8, etc.) — eagerly checkpointed at
# scope entry, so all of them are rewound when the flag is set.
@inline function AdaptiveArrayPools._typed_selective_rewind!(pool::CuAdaptiveArrayPool, tracked_mask::UInt16)
    d = pool._current_depth
    # Low byte = fixed-slot types touched untracked at this depth (set lazily by
    # _mark_untracked!); OR-ed with the macro-supplied tracked mask for rewind.
    untracked = @inbounds(pool._untracked_fixed_masks[d]) & UInt16(0x00FF)
    combined = tracked_mask | untracked
    _has_bit(combined, Float64) && _rewind_typed_pool!(pool.float64, d)
    _has_bit(combined, Float32) && _rewind_typed_pool!(pool.float32, d)
    _has_bit(combined, Int64) && _rewind_typed_pool!(pool.int64, d)
    _has_bit(combined, Int32) && _rewind_typed_pool!(pool.int32, d)
    _has_bit(combined, ComplexF64) && _rewind_typed_pool!(pool.complexf64, d)
    _has_bit(combined, ComplexF32) && _rewind_typed_pool!(pool.complexf32, d)
    _has_bit(combined, Bool) && _rewind_typed_pool!(pool.bool, d)
    # Float16: bit 7 is set by _mark_untracked! on first untracked touch (lazy
    # first-touch). But Float16 can ALSO be a *tracked* type in the macro:
    # _typed_checkpoint_with_lazy! then calls checkpoint!(pool, Float16), which
    # pushes a checkpoint at depth d, while _acquire_impl! (macro transform)
    # bypasses _mark_untracked!, leaving bit 7 = 0. And since
    # _tracked_mask_for_types(Float16) == 0 (because _fixed_slot_bit(Float16) == 0),
    # tracked_mask carries no Float16 bit either.
    # Solution: probe _checkpoint_depths to detect "checkpointed at this depth".
    # Short-circuit order matters: the cheap bit test runs first.
    if combined & _cuda_float16_bit() != 0 || @inbounds(pool.float16._checkpoint_depths[end]) == d
        _rewind_typed_pool!(pool.float16, d)
    end
    if @inbounds(pool._untracked_has_others[d])
        for tp in values(pool.others)
            _rewind_typed_pool!(tp, d)
        end
    end
    # Pop this depth's bookkeeping and step back to the parent scope.
    pop!(pool._untracked_fixed_masks)
    pop!(pool._untracked_has_others)
    pool._current_depth -= 1
    nothing
end

# ==============================================================================
# reset! for CuAdaptiveArrayPool
# ==============================================================================
Expand Down
28 changes: 28 additions & 0 deletions src/acquire.jl
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,34 @@ For non-fixed-slot types, sets `_untracked_has_others` flag.
nothing
end

# CPU override: records an untracked acquire of type T at the current depth and,
# in the lazy modes, retroactively checkpoints T's pool on its first touch.
# Bit 15 of _untracked_fixed_masks[depth] ↔ depth entered via _depth_only_checkpoint!
# Bit 14 of _untracked_fixed_masks[depth] ↔ depth entered via _typed_checkpoint_with_lazy!
# On the first acquire of a fixed-slot type T at such a depth, n_active is saved
# BEFORE the acquire (it still holds the parent's count), so the subsequent
# rewind can restore the parent's state correctly.
@inline function _mark_untracked!(pool::AdaptiveArrayPool, ::Type{T}) where {T}
    depth = pool._current_depth
    bit = _fixed_slot_bit(T)
    if bit == UInt16(0)
        # Non-fixed-slot type: just flag the `others` dict for rewind.
        @inbounds pool._untracked_has_others[depth] = true
        return nothing
    end
    mask = @inbounds pool._untracked_fixed_masks[depth]
    lazy_mode = (mask & 0xC000) != 0   # bit 15 (dynamic) or bit 14 (typed lazy)
    first_touch = (mask & bit) == 0
    if lazy_mode && first_touch
        typed = get_typed_pool!(pool, T)
        # Skip the push when this depth already checkpointed the pool (e.g. a
        # tracked type re-acquired by a helper via acquire! → _mark_untracked!).
        if @inbounds(typed._checkpoint_depths[end]) != depth
            _checkpoint_typed_pool!(typed, depth)
        end
    end
    @inbounds pool._untracked_fixed_masks[depth] = mask | bit
    return nothing
end

# ==============================================================================
# Internal Implementation Functions (called by macro-transformed code)
# ==============================================================================
Expand Down
Loading