diff --git a/examples/xegpu_matmul/lit.local.cfg b/examples/xegpu_matmul/lit.local.cfg deleted file mode 100644 index b310830..0000000 --- a/examples/xegpu_matmul/lit.local.cfg +++ /dev/null @@ -1 +0,0 @@ -config.excludes = ["mlir_utils.py", "payload.py", "runner.py", "schedule.py"] diff --git a/examples/xegpu_matmul/matmul.py b/examples/xegpu_matmul/matmul.py index 277f9c5..8a2dd9c 100644 --- a/examples/xegpu_matmul/matmul.py +++ b/examples/xegpu_matmul/matmul.py @@ -14,7 +14,6 @@ import numpy as np from mlir import ir from mlir.runtime.np_to_memref import ( - get_ranked_memref_descriptor, make_nd_memref_descriptor, as_ctype, ) @@ -22,15 +21,9 @@ from lighthouse.workload import Workload, benchmark from lighthouse.utils.memref import get_packed_arg, to_ctype as memref_to_ctype - -# Import from sibling files: -from schedule import get_schedule_module -from payload import generate_matmul_payload - - -def numpy_to_ctype(arr: np.ndarray) -> ctypes._Pointer: - """Convert numpy array to memref and ctypes **void pointer.""" - return memref_to_ctype(get_ranked_memref_descriptor(arr)) +from lighthouse.utils.numpy import numpy_to_ctype +from lighthouse.schedule.xegpu.matmul_schedule import get_schedule_module +from lighthouse.ingress.gpu import generate_matmul_payload class XeGPUMatMul(Workload): @@ -54,6 +47,7 @@ def __init__( c_type: str = "f32", has_bias: bool = False, has_relu: bool = False, + accumulate_c: bool = True, ): self.M = M self.N = N @@ -73,6 +67,7 @@ def __init__( self.c_dtype = type_str_to_numpy[c_type] self.has_bias = has_bias self.has_relu = has_relu + self.accumulate_c = accumulate_c if has_bias: raise NotImplementedError("Bias is not implemented yet") # cache allocated memrefs @@ -136,7 +131,9 @@ def _reference_solution(self) -> np.ndarray: A, B, C = self._initial_host_arrays # use float32 data type for efficiency f32 = np.float32 - C_ref = A.astype(f32) @ B.astype(f32) + C.astype(f32) + C_ref = A.astype(f32) @ B.astype(f32) + if self.accumulate_c: + 
C_ref += C.astype(f32) if self.has_relu: C_ref = np.maximum(C_ref, 0) if self.has_bias: @@ -196,6 +193,10 @@ def get_complexity(self) -> tuple[int, int, int]: nbytes_ab = np.dtype(self.ab_dtype).itemsize nbytes_c = np.dtype(self.c_dtype).itemsize memory_reads = (M * K + K * N) * nbytes_ab # read A and B + if self.accumulate_c: + memory_reads += M * N * nbytes_c # read C for accumulation + if self.has_bias: + memory_reads += N * nbytes_c # read bias memory_writes = M * N * nbytes_c # write C return (flop_count, memory_reads, memory_writes) @@ -209,6 +210,7 @@ def payload_module(self) -> ir.Module: c_type_str=self.c_type, has_bias=self.has_bias, has_relu=self.has_relu, + accumulate_c=self.accumulate_c, ) return mod @@ -218,8 +220,11 @@ def schedule_module( return get_schedule_module( has_bias=self.has_bias, has_relu=self.has_relu, + has_convert_c=False, + accumulate_c=self.accumulate_c, stop_at_stage=stop_at_stage, - params=parameters, + nlayers=1, + params={"layer_0": parameters}, ) def shared_libs(self) -> list[str]: @@ -309,6 +314,11 @@ def parse_cli(): action="store_true", help="Add relu op after the matrix multiplication (and bias if any).", ) + parser.add_argument( + "--no-accumulate-c", + action="store_true", + help="Compute plain matrix-multiply C=A*B instead of matrix-multiply-accumulate C+=A*B.", + ) parser.add_argument( "--check-result", action="store_true", @@ -342,20 +352,20 @@ def parse_cli(): args = parse_cli() params = { - "auto_wg_d0": args.wg_tile[0], - "auto_wg_d1": args.wg_tile[1], - "auto_sg_d0": args.sg_tile[0], - "auto_sg_d1": args.sg_tile[1], - "auto_k": args.k_tile, - "auto_load_a_d0": args.load_tile_a[0], - "auto_load_a_d1": args.load_tile_a[1], - "auto_load_b_d0": args.load_tile_b[0], - "auto_load_b_d1": args.load_tile_b[1], - "auto_prefetch_a_d0": args.prefetch_tile_a[0], - "auto_prefetch_a_d1": args.prefetch_tile_a[1], - "auto_prefetch_b_d0": args.prefetch_tile_b[0], - "auto_prefetch_b_d1": args.prefetch_tile_b[1], - "auto_nb_prefetch": 
args.nb_prefetch, + "wg_m": args.wg_tile[0], + "wg_n": args.wg_tile[1], + "sg_m": args.sg_tile[0], + "sg_n": args.sg_tile[1], + "k": args.k_tile, + "load_a_m": args.load_tile_a[0], + "load_a_k": args.load_tile_a[1], + "load_b_k": args.load_tile_b[0], + "load_b_n": args.load_tile_b[1], + "pf_a_m": args.prefetch_tile_a[0], + "pf_a_k": args.prefetch_tile_a[1], + "pf_b_k": args.prefetch_tile_b[0], + "pf_b_n": args.prefetch_tile_b[1], + "pf_nb": args.nb_prefetch, } M, N, K = args.sizes @@ -371,6 +381,7 @@ def parse_cli(): c_type=c_type, has_bias=False, has_relu=args.relu, + accumulate_c=not args.no_accumulate_c, ) if args.dump_kernel or args.dump_schedule: diff --git a/examples/xegpu_matmul/payload.py b/examples/xegpu_matmul/payload.py deleted file mode 100644 index 0cf3a45..0000000 --- a/examples/xegpu_matmul/payload.py +++ /dev/null @@ -1,124 +0,0 @@ -from mlir import ir -from mlir.dialects import func, linalg, gpu, bufferization, arith, tensor - - -def emit_gpu_alloc(suffix: str, element_type: ir.Type, rank: int = 2): - dyn = ir.ShapedType.get_dynamic_size() - memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - index_t = ir.IndexType.get() - i32_t = ir.IntegerType.get_signless(32) - inputs = rank * (i32_t,) - - @func.func(*inputs, name="gpu_alloc_" + suffix) - def alloc_func(*shape): - dims = [arith.index_cast(index_t, a) for a in shape] - alloc = gpu.alloc(memref_dyn_t, None, [], dims, []) - return alloc - - alloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - - -def emit_gpu_dealloc(suffix: str, element_type: ir.Type, rank: int = 2): - dyn = ir.ShapedType.get_dynamic_size() - memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - - @func.func(memref_dyn_t, name="gpu_dealloc_" + suffix) - def dealloc_func(memref): - gpu.dealloc(None, [], memref) - - dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - - -def emit_gpu_copy(suffix: str, element_type: ir.Type, rank: int = 2): - """Emit GPU copy 
function.""" - dyn = ir.ShapedType.get_dynamic_size() - memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - - @func.func(memref_dyn_t, memref_dyn_t, name="gpu_copy_" + suffix) - def copy_func(src, dst): - gpu.memcpy(None, [], dst, src) - - copy_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - - -def emit_gpu_util_funcs(element_type: ir.Type): - """Emit GPU utility functions for allocation, deallocation and copy.""" - suffix = { - ir.F16Type.get(): "f16", - ir.F32Type.get(): "f32", - }[element_type] - emit_gpu_alloc(suffix, element_type) - emit_gpu_dealloc(suffix, element_type) - emit_gpu_copy(suffix, element_type) - - -def generate_matmul_payload( - func_name: str, - M: int, - N: int, - K: int, - ab_type_str: str, - c_type_str: str, - has_bias: bool, - has_relu: bool, -) -> ir.Module: - """Generate payload function module.""" - get_ir_dtype = { - "f16": ir.F16Type.get(), - "f32": ir.F32Type.get(), - } - ab_type = get_ir_dtype[ab_type_str] - c_type = get_ir_dtype[c_type_str] - tensor_a_t = ir.RankedTensorType.get((M, K), ab_type) - tensor_b_t = ir.RankedTensorType.get((K, N), ab_type) - tensor_c_t = ir.RankedTensorType.get((M, N), c_type) - tensor_bias_t = ir.RankedTensorType.get((N,), c_type) - memref_a_t = ir.MemRefType.get((M, K), ab_type) - memref_b_t = ir.MemRefType.get((K, N), ab_type) - memref_c_t = ir.MemRefType.get((M, N), c_type) - memref_bias_t = ir.MemRefType.get((N,), c_type) - mod = ir.Module.create() - with ir.InsertionPoint(mod.body): - fargs = [memref_a_t, memref_b_t] - if has_bias: - fargs.append(memref_bias_t) - fargs.append(memref_c_t) - - @func.func(*fargs, name=func_name) - def payload(*args): - A = args[0] - B = args[1] - C = args[-1] - a_tensor = bufferization.to_tensor(tensor_a_t, A, restrict=True) - b_tensor = bufferization.to_tensor(tensor_b_t, B, restrict=True) - c_tensor = bufferization.to_tensor( - tensor_c_t, C, restrict=True, writable=True - ) - - mmul = linalg.matmul(a_tensor, b_tensor, 
outs=[c_tensor]) - terminal = mmul - if has_bias: - bias = args[2] - bias_tensor = bufferization.to_tensor( - tensor_bias_t, bias, restrict=True, writable=True - ) - empty = tensor.empty((M, N), c_type) - bcast = linalg.broadcast(bias_tensor, outs=[empty], dimensions=[0]) - terminal = linalg.add(bcast, terminal, outs=[empty]) - if has_relu: - zero = arith.constant(c_type, 0.0) - empty = tensor.empty((M, N), c_type) - zero_tensor = linalg.fill(zero, outs=[empty]) - terminal = linalg.max(terminal, zero_tensor, outs=[empty]) - - bufferization.materialize_in_destination( - None, terminal, C, restrict=True, writable=True - ) - - payload.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - - emit_gpu_util_funcs(ab_type) - if c_type != ab_type: - emit_gpu_util_funcs(c_type) - - return mod diff --git a/examples/xegpu_matmul/schedule.py b/examples/xegpu_matmul/schedule.py deleted file mode 100644 index b5827be..0000000 --- a/examples/xegpu_matmul/schedule.py +++ /dev/null @@ -1,373 +0,0 @@ -from mlir import ir -from mlir.dialects.transform import loop -from mlir.dialects.transform import bufferization -from mlir.dialects.transform import xegpu -from mlir.dialects.bufferization import LayoutMapOption -from mlir.dialects import transform -from mlir.dialects.transform import structured -from lighthouse.utils.mlir import ( - apply_registered_pass, - canonicalize, - match, -) -from typing import Optional - - -class PipelineInterrupt(Exception): - """Exception to signal early termination of the transform schedule.""" - - pass - - -# hardware constraints -dpas_tile = [8, 16, 16] -prefetch_inst_data = [8, 16] -nb_workitems = 16 # workitems in subgroup - - -def get_schedule_module( - has_bias: bool = False, - has_relu: bool = False, - stop_at_stage: str = "", - params: Optional[dict] = None, -) -> ir.Module: - """Generate transform schedule module.""" - mod = ir.Module.create() - mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() - with 
ir.InsertionPoint(mod.body): - named_sequence = transform.named_sequence( - "__transform_main", - [transform.AnyOpType.get()], # input types - [], # output types - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], - ) - with ir.InsertionPoint(named_sequence.body): - # match the payload module - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - payload_mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - xegpu_matmul_transform_schedule( - payload_mod, - has_bias=has_bias, - has_relu=has_relu, - stop_at_stage=stop_at_stage, - params=params, - ) - - return mod - - -def xegpu_matmul_transform_schedule( - mod: ir.Value, - has_bias: bool = False, - has_relu: bool = False, - stop_at_stage: str = "", - params: Optional[dict] = None, -): - """Transform schedule for matmul-like payload.""" - try: - mod = bundle_xepu_matmul_schedule( - mod, - has_bias=has_bias, - has_relu=has_relu, - stop_at_stage=stop_at_stage, - params=params, - ) - - mod = bundle_xegpu_to_binary( - mod, - stop_at_stage=stop_at_stage, - ) - except PipelineInterrupt: - pass - finally: - transform.yield_() - - -def bundle_xepu_matmul_schedule( - mod, - has_bias: bool = False, - has_relu: bool = False, - stop_at_stage: str = "", - params: Optional[dict] = None, -) -> ir.Module: - """Schedule for lowering matmul-like payload to xegpu wg level.""" - if params is None: - raise ValueError("Schedule parameters must be provided.") - - # tunable parameters - wg_tile = [params["auto_wg_d0"], params["auto_wg_d1"]] - sg_tile = [params["auto_sg_d0"], params["auto_sg_d1"]] - k_tile = params["auto_k"] - - load_tile_a = [params["auto_load_a_d0"], params["auto_load_a_d1"]] - load_tile_b = [params["auto_load_b_d0"], params["auto_load_b_d1"]] - - prefetch_tile_a = [params["auto_prefetch_a_d0"], params["auto_prefetch_a_d1"]] - prefetch_tile_b = [params["auto_prefetch_b_d0"], params["auto_prefetch_b_d1"]] - nb_prefetch = 
params["auto_nb_prefetch"] - - # derived parameters - sg_layout = [wg_tile[0] // sg_tile[0], wg_tile[1] // sg_tile[1]] - # number of threads collapsed to 1d layout - nb_threads = sg_layout[0] * sg_layout[1] * nb_workitems - prefetch_layout_a = [ - wg_tile[0] // prefetch_tile_a[0], - k_tile // prefetch_tile_a[1], - ] - prefetch_layout_b = [ - k_tile // prefetch_tile_b[0], - wg_tile[1] // prefetch_tile_b[1], - ] - - # matmul matrix shapes - sg_tile_a = [sg_tile[0], k_tile] - sg_tile_b = [k_tile, sg_tile[1]] - - if stop_at_stage == "initial": - raise PipelineInterrupt() - - anytype = transform.AnyOpType.get() - anyvalue = transform.AnyValueType.get() - - # match the payload function - anchor = match(mod, ops={"linalg.matmul"}) - func = transform.get_parent_op( - anytype, - anchor, - op_name="func.func", - deduplicate=True, - ) - - dpas_shape_a = [dpas_tile[0], dpas_tile[2]] - dpas_shape_b = [dpas_tile[2], dpas_tile[1]] - dpas_shape_c = [dpas_tile[0], dpas_tile[1]] - - # wg tiling - if has_relu: - terminal = match(mod, ops={"linalg.max"}) - elif has_bias: - terminal = match(mod, ops={"linalg.add"}) - else: - terminal = match(mod, ops={"linalg.matmul"}) - # FIXME use structured.structured_fuse - structured.FuseOp(terminal, tile_sizes=wg_tile, use_forall=True) - transform.apply_cse(mod) - canonicalize(mod) - - # k loop tiling - wg_matmul = match(mod, ops={"linalg.matmul"}) - # FIXME use structured.structured_tile_using_for - wgk_matmul, k_loop = structured.TileUsingForOp( - wg_matmul, sizes=[0, 0, k_tile] - ).results - - transform.apply_cse(func) - canonicalize(func) - - if stop_at_stage == "tiled": - raise PipelineInterrupt() - - # vectorize - # FIXME use structured.structured_vectorize_children_and_apply_patterns - func = structured.VectorizeChildrenAndApplyPatternsOp( - func, - fold_type_extensions_into_contract=True, - ).result - - # hoist loop invariant vector read/store ops - k_loop = match(func, ops={"scf.for"}) - loop.HoistLoopInvariantSubsetsOp(k_loop) - - 
transform.apply_cse(func) - canonicalize(func) - - if stop_at_stage == "vectorized": - raise PipelineInterrupt() - - # bufferize - - # eliminate empty tensors to avoid emitting extra copy ops - mod = apply_registered_pass(mod, "eliminate-empty-tensors") - identity_layout = LayoutMapOption.IdentityLayoutMap - mod = bufferization.OneShotBufferizeOp( - mod, - allow_return_allocs_from_loops=True, - bufferize_function_boundaries=True, - function_boundary_type_conversion=identity_layout, - ).result - # fold memref.subviews into vector.transfer_read/write ops - mod = apply_registered_pass(mod, "fold-memref-alias-ops") - transform.apply_cse(mod) - canonicalize(mod) - - if stop_at_stage == "bufferized": - raise PipelineInterrupt() - - # convert forall to parallel - wg_loop = match(mod, ops={"scf.forall"}) - wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) - func = transform.get_parent_op(anytype, wg_loop) - - # convert to scf.parallel to gpu.launch - func = apply_registered_pass(func, "gpu-map-parallel-loops") - func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") - func = apply_registered_pass(func, "lower-affine") - transform.apply_cse(func) - canonicalize(func) - - # set correct number of gpu threads - launch_op = match(func, ops={"gpu.launch"}) - xegpu.set_gpu_launch_threads(launch_op, threads=[nb_threads, 1, 1]) - - # outline gpu func - func = apply_registered_pass(func, "lower-affine") - canonicalize(func) - func = apply_registered_pass(func, "gpu-launch-sink-index-computations") - mod = apply_registered_pass(mod, "gpu-kernel-outlining") - transform.apply_cse(mod) - - # set xevm target - mod = apply_registered_pass( - mod, - "xevm-attach-target", - options={"O": "3", "chip": "bmg"}, - ) - - # convert vector to xegpu - gpu_mod = match(mod, ops={"gpu.module"}) - gpu_func = match(gpu_mod, ops={"gpu.func"}) - gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") - transform.apply_cse(gpu_func) - - if stop_at_stage == 
"xegpu-initial": - raise PipelineInterrupt() - - # add layouts to DPAS op operands - k_loop = match(gpu_func, ops={"scf.for"}) - dpas_op = match(k_loop, ops={"xegpu.dpas"}) - tile_a = transform.get_operand(anyvalue, dpas_op, [0]) - tile_b = transform.get_operand(anyvalue, dpas_op, [1]) - tile_c = transform.get_operand(anyvalue, dpas_op, [2]) - - def convert_layout(value, input, target): - xegpu.convert_layout( - value, - input_sg_layout=input["sg_layout"], - input_sg_data=input["sg_data"], - input_inst_data=input["inst_data"], - target_sg_layout=target["sg_layout"], - target_sg_data=target["sg_data"], - target_inst_data=target["inst_data"], - ) - - # insert prefetch ops for DPAS A and B tiles - desc_prefetch_a = xegpu.insert_prefetch( - tile_a, - nb_prefetch=nb_prefetch, - ) - layout_prefetch_a = { - "sg_layout": prefetch_layout_a, - "sg_data": prefetch_tile_a, - "inst_data": prefetch_inst_data, - } - pf_ops = transform.get_consumers_of_result(anytype, desc_prefetch_a, 0) - for pf in transform.split_handle((anytype,) * (nb_prefetch + 1), pf_ops): - xegpu.set_op_layout_attr(pf, **layout_prefetch_a) - - desc_prefetch_b = xegpu.insert_prefetch( - tile_b, - nb_prefetch=nb_prefetch, - ) - layout_prefetch_b = { - "sg_layout": prefetch_layout_b, - "sg_data": prefetch_tile_b, - "inst_data": prefetch_inst_data, - } - pf_ops = transform.get_consumers_of_result(anytype, desc_prefetch_b, 0) - for pf in transform.split_handle((anytype,) * (nb_prefetch + 1), pf_ops): - xegpu.set_op_layout_attr(pf, **layout_prefetch_b) - - # A tile load layout - layout_load_a = { - "sg_layout": sg_layout, - "sg_data": sg_tile_a, - "inst_data": load_tile_a, - } - desc_op_a = xegpu.get_desc_op(tile_a) - # A tile load op anchor layout - load_op_a = transform.get_consumers_of_result(anytype, desc_op_a, 0) - xegpu.set_op_layout_attr(load_op_a, **layout_load_a) - # A tile dpas layout - layout_dpas_a = layout_load_a.copy() - layout_dpas_a["inst_data"] = dpas_shape_a - convert_layout(tile_a, 
layout_load_a, layout_dpas_a) - - # B tile load layout - layout_load_b = { - "sg_layout": sg_layout, - "sg_data": sg_tile_b, - "inst_data": load_tile_b, - } - desc_op_b = xegpu.get_desc_op(tile_b) - # B tile load op anchor layout - load_op_b = transform.get_consumers_of_result(anytype, desc_op_b, 0) - xegpu.set_op_layout_attr(load_op_b, **layout_load_b) - # B tile dpas layout - layout_dpas_b = layout_load_b.copy() - layout_dpas_b["inst_data"] = dpas_shape_b - convert_layout(tile_b, layout_load_b, layout_dpas_b) - - # C tile layout - output_layout = { - "sg_layout": sg_layout, - "sg_data": sg_tile, - "inst_data": dpas_shape_c, - } - desc_op_c = xegpu.get_desc_op(tile_c) - # C tile load/store op anchor layout - desc_c_users = transform.get_consumers_of_result(anytype, desc_op_c, 0) - load_op_c, store_op_c = transform.split_handle((anytype, anytype), desc_c_users) - xegpu.set_op_layout_attr(load_op_c, **output_layout) - # C tile dpas anchor layout - xegpu.set_op_layout_attr(dpas_op, index=0, **layout_dpas_a) - xegpu.set_op_layout_attr(dpas_op, index=1, **layout_dpas_b) - xegpu.set_op_layout_attr(dpas_op, index=2, **output_layout) - - if has_bias: - # annotate the 1d load of the broadcast op with a slice layout - add_op = match(gpu_func, ops={"arith.addf"}) - bcast_op = transform.get_producer_of_operand(anytype, add_op, 0) - bcast_load = transform.get_producer_of_operand(anytype, bcast_op, 0) - xegpu.set_op_layout_attr( - bcast_load, result=True, index=0, **output_layout, slice_dims=[0] - ) - raise NotImplementedError("Bias layout propagation is not supported.") - transform.apply_cse(gpu_func) - canonicalize(gpu_func) - - # hoist desc ops out of reduction loop - transform.apply_licm(k_loop) - - canonicalize(gpu_func) - transform.apply_cse(gpu_func) - - if stop_at_stage == "xegpu-wg": - raise PipelineInterrupt() - - return mod - - -def bundle_xegpu_to_binary(mod, stop_at_stage: str = "") -> ir.Module: - """Schedule for lowering xegpu wg level to binary.""" - # upstream 
xegpu/xevm pipeline is payload independent. - mod = apply_registered_pass( - mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} - ) - - return mod diff --git a/examples/xegpu_mlp/README.md b/examples/xegpu_mlp/README.md new file mode 100644 index 0000000..6257240 --- /dev/null +++ b/examples/xegpu_mlp/README.md @@ -0,0 +1,46 @@ +# XeGPU Multilayer Perceptron (MLP) benchmark + +## Installation + +To install Lighthouse with XeGPU support, see installation instructions in [xegpu_matmul/README.md](../xegpu_matmul/README.md). + +## Usage + +Run the default single layer MLP (batch=1024, input_features=1024, output_features=1024) benchmark with correctness test: + +```bash +python mlp.py --check-result +``` + +which is equivalent to + +```bash +python mlp.py -b 1024 -i 1024 -o 1024 --check-result +``` + +Run a 3-layer MLP with batch size 128: + +```bash +python mlp.py -b 128 -i 16384 -o 8192 --hidden-sizes 16384 16384 ... +``` + +which corresponds to + +```txt +MLP with 3 layers + Layer 0: M=128, N=16384, K=16384 + Layer 1: M=128, N=16384, K=16384 + Layer 2: M=128, N=8192, K=16384 +``` + +Add ReLU to all layers: + +```bash +python mlp.py --relu ... +``` + +See all command line arguments: + +```bash +python mlp.py --help +``` diff --git a/examples/xegpu_mlp/mlp.py b/examples/xegpu_mlp/mlp.py new file mode 100644 index 0000000..d8671ce --- /dev/null +++ b/examples/xegpu_mlp/mlp.py @@ -0,0 +1,616 @@ +# RUN: %PYTHON %s --dump-kernel=xegpu-wg | FileCheck %s +# CHECK: module attributes {gpu.container_module} { + +""" +XeGPU MLP benchmark. 
+""" + +import argparse +import ctypes +from typing import Optional +from contextlib import contextmanager +from functools import cached_property +import warnings + +import numpy as np +from mlir import ir +from mlir.runtime.np_to_memref import ( + make_nd_memref_descriptor, + as_ctype, +) +from mlir.execution_engine import ExecutionEngine + +from lighthouse.workload import Workload, benchmark +from lighthouse.utils.memref import get_packed_arg, to_ctype as memref_to_ctype +from lighthouse.utils.numpy import numpy_to_ctype +from lighthouse.schedule.xegpu.matmul_schedule import get_schedule_module +from lighthouse.ingress.gpu import generate_mlp_payload + + +class XeGPUMLP(Workload): + """ + Multi-layer perceptron (MLP) workload on XeGPU. + + Optionally adds a ReLU operation after each layer. + Optionally adds a bias term in each layer (not implemented yet). + """ + + payload_function_name = "payload" + + def __init__( + self, + batch_size: int, + input_size: int, + output_size: int, + hidden_layer_sizes: Optional[list[int]] = None, + ab_type: str = "f16", + c_type: str = "f32", + has_bias: bool = False, + has_relu: bool = False, + accumulate_c: bool = False, + identity_weights: bool = False, + ): + self.batch_size = batch_size + self.input_size = input_size + self.output_size = output_size + self.hidden_layer_sizes = hidden_layer_sizes or [] + self.input_shape = (self.batch_size, self.input_size) + self.output_shape = (self.batch_size, self.output_size) + layer_sizes = [self.input_size] + self.hidden_layer_sizes + [self.output_size] + self.weight_shapes = list(zip(layer_sizes[:-1], layer_sizes[1:])) + self.matmul_layers = [(self.batch_size, o, i) for i, o in self.weight_shapes] + self.identity_weights = identity_weights + + assert ab_type == "f16", "Only f16 type is supported for A and B" + assert c_type == "f32", "Only f32 type is supported for C" + self.ab_type = ab_type + self.c_type = c_type + type_str_to_numpy = { + "f16": np.float16, + "f32": np.float32, + } 
+ self.ab_dtype = type_str_to_numpy[ab_type] + self.c_dtype = type_str_to_numpy[c_type] + self.has_bias = has_bias + self.has_relu = has_relu + self.accumulate_c = accumulate_c + if has_bias: + raise NotImplementedError("Bias is not implemented yet") + + if len(self.matmul_layers) == 1 and self.has_relu: + warnings.warn("Using ReLU on a single layer model has no effect.") + + # cache allocated memrefs + self.gpu_memrefs = {} + + def _allocate_array( + self, + name: str, + shape: tuple[int, ...], + dtype_str: str, + execution_engine: ExecutionEngine, + ) -> ctypes.Structure: + key = (name, dtype_str) + if key in self.gpu_memrefs: + return self.gpu_memrefs[key] + dtype = { + "f16": np.float16, + "f32": np.float32, + }[dtype_str] + alloc_func = execution_engine.lookup("gpu_alloc_" + dtype_str) + mref = make_nd_memref_descriptor(len(shape), as_ctype(dtype))() + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape] + alloc_func(get_packed_arg([ptr_mref] + ptr_dims)) + self.gpu_memrefs[key] = mref + return mref + + def _deallocate_all(self, execution_engine: ExecutionEngine): + for (_, dtype_str), mref in self.gpu_memrefs.items(): + dealloc_func = execution_engine.lookup("gpu_dealloc_" + dtype_str) + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + dealloc_func(get_packed_arg([ptr_mref])) + self.gpu_memrefs = {} + + @contextmanager + def allocate_inputs(self, execution_engine: ExecutionEngine): + try: + yield self._get_input_arrays(execution_engine) + finally: + self._deallocate_all(execution_engine) + + @cached_property + def _initial_host_arrays(self) -> list[np.ndarray]: + """Generate initial values on host with numpy.""" + + # use integer values to avoid f16/f32 floating point discrepancies + def gen_random(shape, dtype): + # generate values in range [-3, 3] + a = np.round(6 * np.random.random_sample(shape)) - 3 + return a.astype(dtype) + + def gen_identity(shape, dtype): + # identity matrix, if cols > rows 
wrap to fill all columns + a = np.zeros(shape, dtype=dtype) + np.fill_diagonal(a, 1) + if shape[1] > shape[0]: + second_block = a[:, shape[0] :] + np.fill_diagonal(second_block, 1) + return a + + np.random.seed(2) + input_array = gen_random(self.input_shape, self.ab_dtype) + output_array = np.zeros(self.output_shape, self.ab_dtype) + weights = [] + for i, o in self.weight_shapes: + if self.identity_weights: + W = gen_identity((i, o), self.ab_dtype) + else: + W = gen_random((i, o), self.ab_dtype) + weights.append(W) + + if self.has_bias: + raise NotImplementedError("Bias initialization not implemented") + + return input_array, output_array, *weights + + @cached_property + def _reference_solution(self) -> np.ndarray: + """Compute reference solution on host with numpy.""" + # NOTE for large problems the solution can overflow float16 range + host_arrays = self._initial_host_arrays + # use float32 data type for efficiency + host_arrays = [arr.astype(np.float32) for arr in host_arrays] + input_array = host_arrays[0] + output_array = host_arrays[1] + weights = host_arrays[2:] + + a_array = input_array + for i, W in enumerate(weights): + C_ref = a_array @ W + if self.has_relu and i < len(weights) - 1: + C_ref = np.maximum(C_ref, 0) + if self.has_bias: + raise NotImplementedError("Bias verification not implemented") + a_array = C_ref.astype(self.ab_dtype).astype(np.float32) + + C_ref += output_array + return C_ref.astype(self.ab_dtype) + + def _get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: + if self.has_bias: + raise NotImplementedError("Bias allocation not implemented yet") + + # allocate arrays on device + input_gpu = self._allocate_array( + "input", self.input_shape, self.ab_type, execution_engine + ) + output_gpu = self._allocate_array( + "output", self.output_shape, self.ab_type, execution_engine + ) + gpu_arrays = [input_gpu, output_gpu] + for i, (in_size, out_size) in enumerate(self.weight_shapes): + W_gpu = 
self._allocate_array( + f"weight_{i}", (in_size, out_size), self.ab_type, execution_engine + ) + gpu_arrays.append(W_gpu) + + # get initial host arrays + host_arrays = self._initial_host_arrays + # copy initial values to device + copy_func_ab = execution_engine.lookup("gpu_copy_" + self.ab_type) + for host_arr, gpu_arr in zip(host_arrays, gpu_arrays): + copy_func_ab( + get_packed_arg([numpy_to_ctype(host_arr), memref_to_ctype(gpu_arr)]) + ) + + # return memrefs for the payload function + return gpu_arrays + + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: + # copy result from device to host + res_gpu = self.gpu_memrefs[("output", self.ab_type)] + res_host_copy = np.zeros(self.output_shape, dtype=self.ab_dtype) + copy_func = execution_engine.lookup("gpu_copy_" + self.ab_type) + copy_func( + get_packed_arg([memref_to_ctype(res_gpu), numpy_to_ctype(res_host_copy)]) + ) + + res_host_ref = self._reference_solution + res_host = res_host_copy + if verbose > 1: + print("Reference solution:") + print(res_host_ref) + print("Computed solution:") + print(res_host) + success = np.allclose(res_host, res_host_ref) + + if verbose: + if success: + print("PASSED") + else: + print("FAILED Result mismatch!") + print(f"Max absolute error: {np.max(np.abs(res_host - res_host_ref))}") + num_diff = np.sum(np.abs(res_host - res_host_ref) > 1e-3) + print(f"Number of differing elements: {num_diff}") + return success + + def get_complexity(self) -> tuple[int, int, int]: + nbytes_ab = np.dtype(self.ab_dtype).itemsize + nbytes_c = np.dtype(self.c_dtype).itemsize + + def matmul_complexity(M, N, K, has_bias, has_relu): + flop_count = 2 * M * N * K + memory_reads = (M * K + K * N) * nbytes_ab # read A and B + memory_writes = M * N * nbytes_c # write C + if has_bias: + flop_count += M * N + memory_reads += N * nbytes_c # read bias + if has_relu: + flop_count += M * N + return flop_count, memory_reads, memory_writes + + flop_count = 0 + memory_reads = 
0 + memory_writes = 0 + for i, (M, N, K) in enumerate(self.matmul_layers): + relu = self.has_relu if i < len(self.matmul_layers) - 1 else False + f, r, w = matmul_complexity(M, N, K, self.has_bias, relu) + flop_count += f + memory_reads += r + memory_writes += w + return (flop_count, memory_reads, memory_writes) + + def payload_module(self) -> ir.Module: + mod = generate_mlp_payload( + func_name=self.payload_function_name, + batch_size=self.batch_size, + input_size=self.input_size, + output_size=self.output_size, + hidden_layer_sizes=self.hidden_layer_sizes, + ab_type_str=self.ab_type, + c_type_str=self.c_type, + has_bias=self.has_bias, + has_relu=self.has_relu, + accumulate_c=self.accumulate_c, + ) + return mod + + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: + return get_schedule_module( + has_bias=self.has_bias, + has_relu=self.has_relu, + accumulate_c=self.accumulate_c, + stop_at_stage=stop_at_stage, + nlayers=len(self.matmul_layers), + params=parameters, + ) + + def shared_libs(self) -> list[str]: + return ["libmlir_levelzero_runtime.so"] + + +matmul_param_db = { + (4096, 4096, 4096): { + "wg_m": 256, + "wg_n": 256, + "sg_m": 32, + "sg_n": 32, + "k": 64, + "load_a_m": 8, + "load_a_k": 16, + "load_b_k": 16, + "load_b_n": 16, + "pf_a_m": 8, + "pf_a_k": 32, + "pf_b_k": 8, + "pf_b_n": 32, + "pf_nb": 1, + }, + (128, 16384, 16384): { + "wg_m": 128, + "wg_n": 256, + "sg_m": 32, + "sg_n": 32, + "k": 256, + "load_a_m": 8, + "load_a_k": 16, + "load_b_k": 32, + "load_b_n": 16, + "pf_a_m": 8, + "pf_a_k": 16, + "pf_b_k": 8, + "pf_b_n": 16, + "pf_nb": 1, + }, + (128, 8192, 16384): { + "wg_m": 64, + "wg_n": 128, + "sg_m": 32, + "sg_n": 32, + "k": 128, + "load_a_m": 16, + "load_a_k": 16, + "load_b_k": 16, + "load_b_n": 16, + "pf_a_m": 32, + "pf_a_k": 16, + "pf_b_k": 16, + "pf_b_n": 32, + "pf_nb": 1, + }, + (128, 32768, 16384): { + "wg_m": 128, + "wg_n": 128, + "sg_m": 32, + "sg_n": 32, + "k": 256, + 
"load_a_m": 8, + "load_a_k": 16, + "load_b_k": 16, + "load_b_n": 16, + "pf_a_m": 16, + "pf_a_k": 32, + "pf_b_k": 8, + "pf_b_n": 32, + "pf_nb": 1, + }, + (128, 16384, 32768): { + "wg_m": 128, + "wg_n": 128, + "sg_m": 32, + "sg_n": 32, + "k": 256, + "load_a_m": 8, + "load_a_k": 16, + "load_b_k": 16, + "load_b_n": 16, + "pf_a_m": 32, + "pf_a_k": 32, + "pf_b_k": 8, + "pf_b_n": 16, + "pf_nb": 1, + }, + (128, 32768, 32768): { + "wg_m": 128, + "wg_n": 256, + "sg_m": 32, + "sg_n": 32, + "k": 256, + "load_a_m": 8, + "load_a_k": 16, + "load_b_k": 16, + "load_b_n": 16, + "pf_a_m": 16, + "pf_a_k": 32, + "pf_b_k": 32, + "pf_b_n": 32, + "pf_nb": 1, + }, + (1024, 1024, 8192): { + "wg_m": 256, + "wg_n": 128, + "sg_m": 32, + "sg_n": 32, + "k": 32, + "load_a_m": 8, + "load_a_k": 16, + "load_b_k": 32, + "load_b_n": 16, + "pf_a_m": 8, + "pf_a_k": 16, + "pf_b_k": 8, + "pf_b_n": 16, + "pf_nb": 1, + }, + (1024, 8192, 1024): { + "wg_m": 256, + "wg_n": 128, + "sg_m": 32, + "sg_n": 32, + "k": 32, + "load_a_m": 16, + "load_a_k": 16, + "load_b_k": 32, + "load_b_n": 16, + "pf_a_m": 8, + "pf_a_k": 16, + "pf_b_k": 16, + "pf_b_n": 16, + "pf_nb": 1, + }, + (1024, 1024, 1024): { + "wg_m": 128, + "wg_n": 64, + "sg_m": 32, + "sg_n": 32, + "k": 32, + "load_a_m": 16, + "load_a_k": 16, + "load_b_k": 32, + "load_b_n": 16, + "pf_a_m": 8, + "pf_a_k": 32, + "pf_b_k": 8, + "pf_b_n": 16, + "pf_nb": 1, + }, +} + + +class ParameterOracleMLP: + def __init__(self, workload: XeGPUMLP): + self.param_db = matmul_param_db + self.workload = workload + + def get_parameters(self) -> dict[str, dict]: + parameters = {} + for i, shape in enumerate(self.workload.matmul_layers): + if shape in self.param_db: + params = self.param_db[shape] + else: + raise ValueError(f"No parameters found for matmul shape {shape}") + parameters[f"layer_{i}"] = params + return parameters + + +def parse_cli(): + parser = argparse.ArgumentParser( + description="Matrix Multiplication using MLIR", + 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-b", + "--batch-size", + type=int, + default=1024, + help="Batch size M. Input matrix has shape (M x K).", + ) + parser.add_argument( + "-i", + "--input-size", + type=int, + default=1024, + help="Number of input features K. Input matrix has shape (M x K).", + ) + parser.add_argument( + "-o", + "--output-size", + type=int, + default=1024, + help="Number of output features N. Output matrix has shape (M x N).", + ) + parser.add_argument( + "--hidden-sizes", + type=int, + nargs="+", + help="Number of features in each hidden layers.", + ) + parser.add_argument( + "--nruns", + type=int, + default=1000, + help="Number of runs to average the execution time.", + ) + parser.add_argument( + "--nwarmup", + type=int, + default=20, + help="Number of warm-up iterations before benchmarking.", + ) + parser.add_argument( + "--relu", + action="store_true", + help="Add ReLU activation function to each layer except the output layer.", + ) + parser.add_argument( + "--accumulate-c", + action="store_true", + help="Use matrix-multiply-accumulate layers instead of initializing the " + "accumulator tile with zeros.", + ) + parser.add_argument( + "--check-result", + action="store_true", + help="Check the result of the MLP model. If the result overflows to " + "inf/nan values, use --identity-weights option.", + ) + parser.add_argument( + "--identity-weights", + action="store_true", + help="Initialize weights as (extended) identity matrix, useful for " + "correctness test. 
Can skew performance measurement.", + ) + parser.add_argument( + "--dump-kernel", + type=str, + choices=[ + "initial", + "tiled", + "vectorized", + "bufferized", + "xegpu-initial", + "xegpu-wg", + "xegpu-sg", + "xegpu-inst", + "final", + ], + help="Dump kernel IR at different stages of lowering.", + ) + parser.add_argument( + "--dump-schedule", + action="store_true", + help="Dump transform schedule.", + ) + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + args = parse_cli() + + ab_type = "f16" + c_type = "f32" + + with ir.Context(), ir.Location.unknown(): + wload = XeGPUMLP( + batch_size=args.batch_size, + input_size=args.input_size, + output_size=args.output_size, + hidden_layer_sizes=args.hidden_sizes, + ab_type=ab_type, + c_type=c_type, + has_bias=False, + has_relu=args.relu, + accumulate_c=args.accumulate_c, + identity_weights=args.identity_weights, + ) + matmuls = wload.matmul_layers + print(f"MLP with {len(matmuls)} layers") + for i, (M, N, K) in enumerate(matmuls): + print(f" Layer {i}: M={M}, N={N}, K={K}") + + param_oracle = ParameterOracleMLP(wload) + params = param_oracle.get_parameters() + + if args.dump_kernel or args.dump_schedule: + wload.lower_payload( + dump_payload=args.dump_kernel, + dump_schedule=args.dump_schedule, + schedule_parameters=params, + ) + else: + times = benchmark( + wload, + nruns=args.nruns, + nwarmup=args.nwarmup, + schedule_parameters=params, + check_correctness=args.check_result, + verbose=2, + ) + times *= 1e6 # convert to microseconds + elapsed = np.mean(times) + flop_count = wload.get_complexity()[0] + gflops = flop_count / (elapsed * 1e-6) / 1e9 + + def list2str(a): + return ",".join(map(str, a)) + + hidden_sizes = args.hidden_sizes if args.hidden_sizes else [] + parts = [ + f"b={args.batch_size}", + f"i={args.input_size}", + f"o={args.output_size}", + f"hs={list2str(hidden_sizes)}", + f"dt={ab_type},{c_type}", + f"time(us): {elapsed:.2f}", + f"GFLOPS: {gflops:.2f}", + ] + print(" ".join(parts)) 
diff --git a/lighthouse/ingress/gpu/__init__.py b/lighthouse/ingress/gpu/__init__.py new file mode 100644 index 0000000..af910e2 --- /dev/null +++ b/lighthouse/ingress/gpu/__init__.py @@ -0,0 +1,3 @@ +from .matmul import generate_matmul_payload, generate_mlp_payload + +__all__ = ["generate_matmul_payload", "generate_mlp_payload"] diff --git a/lighthouse/ingress/gpu/matmul.py b/lighthouse/ingress/gpu/matmul.py new file mode 100644 index 0000000..f100dfe --- /dev/null +++ b/lighthouse/ingress/gpu/matmul.py @@ -0,0 +1,315 @@ +from mlir import ir +from mlir.dialects import func, linalg, gpu, bufferization, arith, tensor + + +def emit_gpu_alloc(suffix: str, element_type: ir.Type, rank: int = 2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + index_t = ir.IndexType.get() + i32_t = ir.IntegerType.get_signless(32) + inputs = rank * (i32_t,) + + @func.func(*inputs, name="gpu_alloc_" + suffix) + def alloc_func(*shape): + dims = [arith.index_cast(index_t, a) for a in shape] + alloc = gpu.alloc(memref_dyn_t, None, [], dims, []) + return alloc + + alloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + + +def emit_gpu_dealloc(suffix: str, element_type: ir.Type, rank: int = 2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + + @func.func(memref_dyn_t, name="gpu_dealloc_" + suffix) + def dealloc_func(memref): + gpu.dealloc(None, [], memref) + + dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + + +def emit_gpu_copy(suffix: str, element_type: ir.Type, rank: int = 2): + """Emit GPU copy function.""" + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + + @func.func(memref_dyn_t, memref_dyn_t, name="gpu_copy_" + suffix) + def copy_func(src, dst): + gpu.memcpy(None, [], dst, src) + + copy_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + + +def 
emit_gpu_util_funcs(element_type: ir.Type): + """Emit GPU utility functions for allocation, deallocation and copy.""" + suffix = { + ir.F16Type.get(): "f16", + ir.F32Type.get(): "f32", + }[element_type] + emit_gpu_alloc(suffix, element_type) + emit_gpu_dealloc(suffix, element_type) + emit_gpu_copy(suffix, element_type) + + +def emit_mlp_layer( + a_tensor, + b_tensor, + c_tensor, + ab_type, + c_type, + bias_tensor=None, + has_relu=False, + accumulate_c=True, + convert_c_type=False, +) -> ir.Value: + M, N = c_tensor.type.shape + id_map = ir.AffineMap.get_identity(2) + par_iter = linalg.IteratorType.parallel + if convert_c_type and accumulate_c: + empty = tensor.empty((M, N), c_type) + + @linalg.generic( + [c_tensor], + [empty], + [id_map, id_map], + [par_iter, par_iter], + ) + def f(a, b): + return arith.extf(c_type, a) + + input_c_tensor = f + else: + if accumulate_c: + input_c_tensor = c_tensor + else: + zero = arith.constant(c_type, 0.0) + empty = tensor.empty((M, N), c_type) + zero_tensor = linalg.fill(zero, outs=[empty]) + input_c_tensor = zero_tensor + mmul = linalg.matmul(a_tensor, b_tensor, outs=[input_c_tensor]) + terminal = mmul + res_type = c_type + if convert_c_type: + res_type = ab_type + empty = tensor.empty((M, N), ab_type) + + @linalg.generic( + [terminal], + [empty], + [id_map, id_map], + [par_iter, par_iter], + ) + def f(a, b): + return arith.truncf(ab_type, a) + + terminal = f + if bias_tensor is not None: + empty = tensor.empty((M, N), res_type) + bcast = linalg.broadcast(bias_tensor, outs=[empty], dimensions=[0]) + terminal = linalg.add(bcast, terminal, outs=[empty]) + if has_relu: + zero = arith.constant(ab_type if convert_c_type else c_type, 0.0) + empty = tensor.empty((M, N), res_type) + zero_tensor = linalg.fill(zero, outs=[empty]) + terminal = linalg.max(terminal, zero_tensor, outs=[empty]) + + return terminal + + +def generate_matmul_payload( + func_name: str, + M: int, + N: int, + K: int, + ab_type_str: str, + c_type_str: str, + has_bias: 
bool, + has_relu: bool, + accumulate_c: bool, +) -> ir.Module: + """Generate payload function module.""" + get_ir_dtype = { + "f16": ir.F16Type.get(), + "f32": ir.F32Type.get(), + } + ab_type = get_ir_dtype[ab_type_str] + c_type = get_ir_dtype[c_type_str] + tensor_a_t = ir.RankedTensorType.get((M, K), ab_type) + tensor_b_t = ir.RankedTensorType.get((K, N), ab_type) + tensor_c_t = ir.RankedTensorType.get((M, N), c_type) + memref_a_t = ir.MemRefType.get((M, K), ab_type) + memref_b_t = ir.MemRefType.get((K, N), ab_type) + memref_c_t = ir.MemRefType.get((M, N), c_type) + memref_bias_t = ir.MemRefType.get((N,), c_type) + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + fargs = [memref_a_t, memref_b_t] + if has_bias: + fargs.append(memref_bias_t) + fargs.append(memref_c_t) + + @func.func(*fargs, name=func_name) + def payload(*args): + A = args[0] + B = args[1] + C = args[-1] + bias = args[2] if has_bias else None + a_tensor = bufferization.to_tensor(tensor_a_t, A, restrict=True) + b_tensor = bufferization.to_tensor(tensor_b_t, B, restrict=True) + c_tensor = bufferization.to_tensor( + tensor_c_t, C, restrict=True, writable=True + ) + if has_bias: + bias_tensor = bufferization.to_tensor( + ir.RankedTensorType.get((N,), c_type), bias, restrict=True + ) + else: + bias_tensor = None + + output = emit_mlp_layer( + a_tensor, + b_tensor, + c_tensor, + ab_type, + c_type, + bias_tensor, + has_relu, + accumulate_c=accumulate_c, + convert_c_type=False, + ) + bufferization.materialize_in_destination( + None, output, C, restrict=True, writable=True + ) + + payload.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + + emit_gpu_util_funcs(ab_type) + if c_type != ab_type: + emit_gpu_util_funcs(c_type) + + return mod + + +def emit_buf_to_tensor(memref_value: ir.Value, **kwargs) -> ir.Value: + memref_type = memref_value.type + shape = memref_type.shape + element_type = memref_type.element_type + tensor_type = ir.RankedTensorType.get(shape, element_type) + return 
bufferization.to_tensor(tensor_type, memref_value, **kwargs) + + +def generate_mlp_payload( + func_name: str, + batch_size: int, + input_size: int, + output_size: int, + hidden_layer_sizes: list[int], + ab_type_str: str, + c_type_str: str, + has_bias: bool, + has_relu: bool, + accumulate_c: bool, +) -> ir.Module: + """Generate payload function module.""" + get_ir_dtype = { + "f16": ir.F16Type.get(), + "f32": ir.F32Type.get(), + } + ab_type = get_ir_dtype[ab_type_str] + c_type = get_ir_dtype[c_type_str] + mod = ir.Module.create() + memref_in_t = ir.MemRefType.get((batch_size, input_size), ab_type) + memref_out_t = ir.MemRefType.get((batch_size, output_size), ab_type) + layer_sizes = [input_size] + hidden_layer_sizes + [output_size] + feature_sizes = list(zip(layer_sizes[:-1], layer_sizes[1:])) + weight_memref_types = [] + bias_memref_types = [] + for in_size, out_size in feature_sizes: + memref_t = ir.MemRefType.get((in_size, out_size), ab_type) + weight_memref_types.append(memref_t) + if has_bias: + memref_t = ir.MemRefType.get((out_size,), c_type) + bias_memref_types.append(memref_t) + with ir.InsertionPoint(mod.body): + # function argument order: + # input, output, weights_0, weights_1, ..., [bias_0, bias_1, ...] 
+ fargs = [memref_in_t, memref_out_t] + weight_memref_types + if has_bias: + fargs += bias_memref_types + + @func.func(*fargs, name=func_name) + def payload(*args): + input = args[0] + output = args[1] + nlayers = len(hidden_layer_sizes) + 1 + weights = args[2 : 2 + nlayers] + biases = args[2 + nlayers :] if has_bias else [None] * nlayers + input_tensor = emit_buf_to_tensor(input, restrict=True) + output_tensor = emit_buf_to_tensor(output, restrict=True) + weight_tensors = [] + for weight_memref in weights: + weight_tensor = emit_buf_to_tensor(weight_memref, restrict=True) + weight_tensors.append(weight_tensor) + bias_tensors = [] + for bias_memref in biases: + if has_bias: + bias_tensor = emit_buf_to_tensor(bias_memref, restrict=True) + else: + bias_tensor = None + bias_tensors.append(bias_tensor) + + layer_output = input_tensor + to_dealloc = None + for i, (weight, bias) in enumerate(zip(weight_tensors, bias_tensors)): + a_tensor = layer_output + b_tensor = weight + M, K = a_tensor.type.shape + _, N = b_tensor.type.shape + if i == nlayers - 1: + c_tensor = output_tensor + else: + # allocate intermediate buffer + memref_type = ir.MemRefType.get((M, N), ab_type) + c_memref = gpu.alloc(memref_type, None, [], [], []) + gpu.memset(None, [], c_memref, arith.constant(ab_type, 0.0)) + c_tensor = emit_buf_to_tensor( + c_memref, restrict=True, writable=True + ) + bias_tensor = bias + # skip relu for final layer + emit_relu = has_relu if i < nlayers - 1 else False + layer_output = emit_mlp_layer( + a_tensor, + b_tensor, + c_tensor, + ab_type, + c_type, + bias_tensor, + emit_relu, + accumulate_c=accumulate_c, + convert_c_type=True, + ) + if i != nlayers - 1: + bufferization.materialize_in_destination( + None, layer_output, c_memref, restrict=True, writable=True + ) + if to_dealloc is not None: + gpu.dealloc(None, [], to_dealloc) + to_dealloc = None + if i != nlayers - 1: + # deallocate after next layer + to_dealloc = c_memref + + # finalize + 
bufferization.materialize_in_destination( + None, layer_output, output, restrict=True, writable=True + ) + + payload.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + + emit_gpu_util_funcs(ab_type) + if c_type != ab_type: + emit_gpu_util_funcs(c_type) + + return mod diff --git a/lighthouse/schedule/__init__.py b/lighthouse/schedule/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lighthouse/schedule/xegpu/__init__.py b/lighthouse/schedule/xegpu/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lighthouse/schedule/xegpu/matmul_schedule.py b/lighthouse/schedule/xegpu/matmul_schedule.py new file mode 100644 index 0000000..77ec013 --- /dev/null +++ b/lighthouse/schedule/xegpu/matmul_schedule.py @@ -0,0 +1,433 @@ +from mlir import ir +from mlir.dialects.transform import loop +from mlir.dialects.transform import bufferization +from mlir.dialects.transform import xegpu +from mlir.dialects.bufferization import LayoutMapOption +from mlir.dialects import transform +from mlir.dialects.transform import structured +from lighthouse.utils.mlir import ( + apply_registered_pass, + canonicalize, + match, +) +from typing import Optional + + +class PipelineInterrupt(Exception): + """Exception to signal early termination of the transform schedule.""" + + pass + + +def match_and_split(*args, nhandles=1, **kwargs): + """Helper function that splits matched handles.""" + matched = match(*args, **kwargs) + anytype = transform.AnyOpType.get() + matched_ops = transform.split_handle((anytype,) * nhandles, matched) + if nhandles == 1: + matched_ops = [matched_ops] + return matched_ops + + +# hardware constraints +dpas_tile = [8, 16, 16] +prefetch_inst_data = [8, 16] +nb_workitems = 16 # workitems in subgroup + + +def get_schedule_module( + has_bias: bool = False, + has_relu: bool = False, + has_convert_c: bool = True, + accumulate_c: bool = False, + stop_at_stage: str = "", + nlayers: int = 1, + params: Optional[dict] = None, +) ->
ir.Module: + """Generate transform schedule module.""" + mod = ir.Module.create() + mod.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get() + with ir.InsertionPoint(mod.body): + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], # input types + [], # output types + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + ) + with ir.InsertionPoint(named_sequence.body): + # match the payload module + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + payload_mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + xegpu_mlp_transform_schedule( + payload_mod, + has_bias=has_bias, + has_relu=has_relu, + has_convert_c=has_convert_c, + accumulate_c=accumulate_c, + stop_at_stage=stop_at_stage, + nlayers=nlayers, + params=params, + ) + + return mod + + +def xegpu_mlp_transform_schedule( + mod: ir.Value, + has_bias: bool = False, + has_relu: bool = False, + accumulate_c: bool = False, + has_convert_c: bool = True, + stop_at_stage: str = "", + nlayers: int = 1, + params: Optional[list[dict]] = None, +): + """Transform schedule for matmul-like payload.""" + try: + mod = bundle_xegpu_mlp_schedule( + mod, + has_bias=has_bias, + has_relu=has_relu, + accumulate_c=accumulate_c, + has_convert_c=has_convert_c, + stop_at_stage=stop_at_stage, + nlayers=nlayers, + params=params, + ) + + mod = bundle_xegpu_to_binary( + mod, + stop_at_stage=stop_at_stage, + ) + except PipelineInterrupt: + pass + finally: + transform.yield_() + + +def bundle_xegpu_mlp_schedule( + mod: ir.Value, + has_bias: bool = False, + has_relu: bool = False, + accumulate_c: bool = False, + has_convert_c: bool = True, + stop_at_stage: str = "", + nlayers: int = 1, + params: Optional[list[dict]] = None, +) -> ir.Module: + """Schedule for lowering matmul-like payload to xegpu wg level.""" + if params is None: + raise ValueError("Schedule parameters must be
provided.") + + if stop_at_stage == "initial": + raise PipelineInterrupt() + + anytype = transform.AnyOpType.get() + anyvalue = transform.AnyValueType.get() + + for i in range(nlayers): + assert f"layer_{i}" in params, f"Missing parameters for 'layer_{i}'" + + dpas_shape_a = [dpas_tile[0], dpas_tile[2]] + dpas_shape_b = [dpas_tile[2], dpas_tile[1]] + dpas_shape_c = [dpas_tile[0], dpas_tile[1]] + + # wg tiling + if has_convert_c: + trunc_op = match(mod, ops={"arith.truncf"}) + terminal = transform.get_parent_op(anytype, trunc_op) + # split handle for each layer + terminal_ops = transform.split_handle((anytype,) * nlayers, terminal) + if nlayers == 1: + terminal_ops = [terminal_ops] + elif has_bias: + terminal_ops = match_and_split(mod, ops={"linalg.add"}, nhandles=nlayers) + else: + terminal_ops = match_and_split(mod, ops={"linalg.matmul"}, nhandles=nlayers) + if has_relu and nlayers > 1: + # intermediate layers have relu activation function + relu_ops = match_and_split(mod, ops={"linalg.max"}, nhandles=nlayers - 1) + # the final layer does not have relu + terminal_ops = list(relu_ops) + [terminal_ops[-1]] + + # tile each layer separately + for i_layer in range(nlayers): + layer_params = params[f"layer_{i_layer}"] + # tunable parameters: wg level tiling + wg_tile = [layer_params["wg_m"], layer_params["wg_n"]] + sg_tile = [layer_params["sg_m"], layer_params["sg_n"]] + k_tile = layer_params["k"] + + terminal = terminal_ops[i_layer] + # FIXME use structured.structured_fuse + _, wg_loop = structured.FuseOp( + terminal, tile_sizes=wg_tile, use_forall=True + ).results + transform.apply_cse(mod) + canonicalize(mod) + + # k loop tiling + wg_matmul = match(wg_loop, ops={"linalg.matmul"}) + # FIXME use structured.structured_tile_using_for + wgk_matmul, k_loop = structured.TileUsingForOp( + wg_matmul, sizes=[0, 0, k_tile] + ).results + + func = transform.get_parent_op( + anytype, + k_loop, + op_name="func.func", + deduplicate=True, + ) + transform.apply_cse(func) + 
canonicalize(func) + + if stop_at_stage == "tiled": + raise PipelineInterrupt() + + # vectorize + # FIXME use structured.structured_vectorize_children_and_apply_patterns + func = structured.VectorizeChildrenAndApplyPatternsOp( + func, + fold_type_extensions_into_contract=True, + ).result + + # hoist loop invariant vector read/store ops + k_loop = match(func, ops={"scf.for"}) + loop.HoistLoopInvariantSubsetsOp(k_loop) + + transform.apply_cse(func) + canonicalize(func) + + if stop_at_stage == "vectorized": + raise PipelineInterrupt() + + # bufferize + + # eliminate empty tensors to avoid emitting extra copy ops + mod = apply_registered_pass(mod, "eliminate-empty-tensors") + identity_layout = LayoutMapOption.IdentityLayoutMap + mod = bufferization.OneShotBufferizeOp( + mod, + allow_return_allocs_from_loops=True, + bufferize_function_boundaries=True, + function_boundary_type_conversion=identity_layout, + ).result + # fold memref.subviews into vector.transfer_read/write ops + mod = apply_registered_pass(mod, "fold-memref-alias-ops") + transform.apply_cse(mod) + canonicalize(mod) + + if stop_at_stage == "bufferized": + raise PipelineInterrupt() + + # convert forall to parallel + wg_loops = match_and_split(mod, ops={"scf.forall"}, nhandles=nlayers) + for wg_loop in wg_loops: + wg_loop = loop.loop_forall_to_parallel([anytype], wg_loop) + func = transform.get_parent_op(anytype, wg_loop) + + # convert to scf.parallel to gpu.launch + func = apply_registered_pass(func, "gpu-map-parallel-loops") + func = apply_registered_pass(func, "convert-parallel-loops-to-gpu") + func = apply_registered_pass(func, "lower-affine") + transform.apply_cse(func) + canonicalize(func) + + # set correct number of gpu threads + launch_ops = match_and_split(mod, ops={"gpu.launch"}, nhandles=nlayers) + for i_layer, launch_op in enumerate(launch_ops): + layer_params = params[f"layer_{i_layer}"] + # tunable parameters + wg_tile = [layer_params["wg_m"], layer_params["wg_n"]] + sg_tile = 
[layer_params["sg_m"], layer_params["sg_n"]] + + # derived parameters + sg_layout = [wg_tile[0] // sg_tile[0], wg_tile[1] // sg_tile[1]] + # number of threads collapsed to 1d layout + nb_threads = sg_layout[0] * sg_layout[1] * nb_workitems + + xegpu.set_gpu_launch_threads(launch_op, threads=[nb_threads, 1, 1]) + + # outline gpu func + func = apply_registered_pass(func, "lower-affine") + canonicalize(func) + func = apply_registered_pass(func, "gpu-launch-sink-index-computations") + mod = apply_registered_pass(mod, "gpu-kernel-outlining") + transform.apply_cse(mod) + + # set xevm target + mod = apply_registered_pass( + mod, + "xevm-attach-target", + options={"O": "3", "chip": "bmg"}, + ) + + # convert vector to xegpu + gpu_mod_ops = match_and_split(mod, ops={"gpu.module"}, nhandles=nlayers) + for gpu_mod in gpu_mod_ops: + gpu_func = match(gpu_mod, ops={"gpu.func"}) + gpu_func = apply_registered_pass(gpu_func, "convert-vector-to-xegpu") + transform.apply_cse(gpu_func) + + if stop_at_stage == "xegpu-initial": + raise PipelineInterrupt() + + for i_layer, gpu_mod in enumerate(gpu_mod_ops): + gpu_func = match(gpu_mod, ops={"gpu.func"}) + + # tunable parameters: xegpu layout + layer_params = params[f"layer_{i_layer}"] + + wg_tile = [layer_params["wg_m"], layer_params["wg_n"]] + sg_tile = [layer_params["sg_m"], layer_params["sg_n"]] + k_tile = layer_params["k"] + + sg_layout = [wg_tile[0] // sg_tile[0], wg_tile[1] // sg_tile[1]] + + load_tile_a = [layer_params["load_a_m"], layer_params["load_a_k"]] + load_tile_b = [layer_params["load_b_k"], layer_params["load_b_n"]] + prefetch_tile_a = [layer_params["pf_a_m"], layer_params["pf_a_k"]] + prefetch_tile_b = [layer_params["pf_b_k"], layer_params["pf_b_n"]] + nb_prefetch = layer_params["pf_nb"] + + prefetch_layout_a = [ + wg_tile[0] // prefetch_tile_a[0], + k_tile // prefetch_tile_a[1], + ] + prefetch_layout_b = [ + k_tile // prefetch_tile_b[0], + wg_tile[1] // prefetch_tile_b[1], + ] + + # matmul matrix shapes + sg_tile_a = 
[sg_tile[0], k_tile] + sg_tile_b = [k_tile, sg_tile[1]] + + # add layouts to DPAS op operands + k_loop = match(gpu_func, ops={"scf.for"}) + dpas_op = match(k_loop, ops={"xegpu.dpas"}) + tile_a = transform.get_operand(anyvalue, dpas_op, [0]) + tile_b = transform.get_operand(anyvalue, dpas_op, [1]) + + def convert_layout(value, input, target): + xegpu.convert_layout( + value, + input_sg_layout=input["sg_layout"], + input_sg_data=input["sg_data"], + input_inst_data=input["inst_data"], + target_sg_layout=target["sg_layout"], + target_sg_data=target["sg_data"], + target_inst_data=target["inst_data"], + ) + + # insert prefetch ops for DPAS A and B tiles + desc_prefetch_a = xegpu.insert_prefetch( + tile_a, + nb_prefetch=nb_prefetch, + ) + layout_prefetch_a = { + "sg_layout": prefetch_layout_a, + "sg_data": prefetch_tile_a, + "inst_data": prefetch_inst_data, + } + pf_ops = transform.get_consumers_of_result(anytype, desc_prefetch_a, 0) + for pf in transform.split_handle((anytype,) * (nb_prefetch + 1), pf_ops): + xegpu.set_op_layout_attr(pf, **layout_prefetch_a) + + desc_prefetch_b = xegpu.insert_prefetch( + tile_b, + nb_prefetch=nb_prefetch, + ) + layout_prefetch_b = { + "sg_layout": prefetch_layout_b, + "sg_data": prefetch_tile_b, + "inst_data": prefetch_inst_data, + } + pf_ops = transform.get_consumers_of_result(anytype, desc_prefetch_b, 0) + for pf in transform.split_handle((anytype,) * (nb_prefetch + 1), pf_ops): + xegpu.set_op_layout_attr(pf, **layout_prefetch_b) + + # A tile load layout + layout_load_a = { + "sg_layout": sg_layout, + "sg_data": sg_tile_a, + "inst_data": load_tile_a, + } + desc_op_a = xegpu.get_desc_op(tile_a) + # A tile load op anchor layout + load_op_a = transform.get_consumers_of_result(anytype, desc_op_a, 0) + xegpu.set_op_layout_attr(load_op_a, **layout_load_a) + # A tile dpas layout + layout_dpas_a = layout_load_a.copy() + layout_dpas_a["inst_data"] = dpas_shape_a + convert_layout(tile_a, layout_load_a, layout_dpas_a) + + # B tile load layout + 
layout_load_b = { + "sg_layout": sg_layout, + "sg_data": sg_tile_b, + "inst_data": load_tile_b, + } + desc_op_b = xegpu.get_desc_op(tile_b) + # B tile load op anchor layout + load_op_b = transform.get_consumers_of_result(anytype, desc_op_b, 0) + xegpu.set_op_layout_attr(load_op_b, **layout_load_b) + # B tile dpas layout + layout_dpas_b = layout_load_b.copy() + layout_dpas_b["inst_data"] = dpas_shape_b + convert_layout(tile_b, layout_load_b, layout_dpas_b) + + # C tile layout + output_layout = { + "sg_layout": sg_layout, + "sg_data": sg_tile, + "inst_data": dpas_shape_c, + } + # C tile dpas anchor layout + xegpu.set_op_layout_attr(dpas_op, index=0, **layout_dpas_a) + xegpu.set_op_layout_attr(dpas_op, index=1, **layout_dpas_b) + xegpu.set_op_layout_attr(dpas_op, index=2, **output_layout) + # annotate store op + store_op_c = match(gpu_func, ops={"xegpu.store_nd"}) + xegpu.set_op_layout_attr(store_op_c, **output_layout) + + if has_bias: + # annotate the 1d load of the broadcast op with a slice layout + add_op = match(gpu_func, ops={"arith.addf"}) + bcast_op = transform.get_producer_of_operand(anytype, add_op, 0) + bcast_load = transform.get_producer_of_operand(anytype, bcast_op, 0) + xegpu.set_op_layout_attr( + bcast_load, result=True, index=0, **output_layout, slice_dims=[0] + ) + raise NotImplementedError("Bias layout propagation is not supported.") + + transform.apply_cse(gpu_func) + canonicalize(gpu_func) + + # hoist desc ops out of reduction loop + transform.apply_licm(k_loop) + + canonicalize(gpu_func) + transform.apply_cse(gpu_func) + + if stop_at_stage == "xegpu-wg": + raise PipelineInterrupt() + + return mod + + +def bundle_xegpu_to_binary(mod, stop_at_stage: str = "") -> ir.Module: + """Schedule for lowering xegpu wg level to binary.""" + # upstream xegpu/xevm pipeline is payload independent. 
+ mod = apply_registered_pass( + mod, "gpu-lower-to-xevm-pipeline", options={"xegpu-op-level": "workgroup"} + ) + + return mod diff --git a/lighthouse/utils/numpy.py b/lighthouse/utils/numpy.py new file mode 100644 index 0000000..6ce67ec --- /dev/null +++ b/lighthouse/utils/numpy.py @@ -0,0 +1,10 @@ +import ctypes + +import numpy as np +from mlir.runtime.np_to_memref import get_ranked_memref_descriptor +from lighthouse.utils.memref import to_ctype + + +def numpy_to_ctype(arr: np.ndarray) -> ctypes._Pointer: + """Convert numpy array to memref and ctypes **void pointer.""" + return to_ctype(get_ranked_memref_descriptor(arr))