From 91b0f0cf4b0cd730d55188092117456a4ec8eca8 Mon Sep 17 00:00:00 2001 From: WestonJB Date: Thu, 5 Feb 2026 12:23:53 -0500 Subject: [PATCH 1/2] Updated axpy to support n > number of threads --- src/level1/axpy_device.mojo | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/level1/axpy_device.mojo b/src/level1/axpy_device.mojo index 7a44154..3cd1232 100644 --- a/src/level1/axpy_device.mojo +++ b/src/level1/axpy_device.mojo @@ -1,21 +1,30 @@ -from gpu import thread_idx +from gpu import grid_dim, block_dim, global_idx fn axpy_device[dtype: DType]( n: Int, - sa: SIMD[dtype, 1], - sx: UnsafePointer[SIMD[dtype, 1], ImmutAnyOrigin], + a: SIMD[dtype, 1], + x: UnsafePointer[SIMD[dtype, 1], ImmutAnyOrigin], incx: Int, - sy: UnsafePointer[SIMD[dtype, 1], MutAnyOrigin], + y: UnsafePointer[SIMD[dtype, 1], MutAnyOrigin], incy: Int ): - var thread_id: UInt = thread_idx.x - if (n <= 0): return - if (sa == 0): + if (a == 0): return if (incx == 0 or incy == 0): return - if (thread_id < n): - sy[thread_id*incy] += sa*sx[thread_id*incx] + var global_i = global_idx.x + var n_threads = grid_dim.x * block_dim.x + + if (n_threads <= global_i): + # Standard case: each thread gets 1 cell + if (global_i < n): + y[global_i*incy] += a * x[global_i*incx] + + else: + # Multiple cells per thread + for i in range(global_i, n, n_threads): + y[i*incy] += a * x[i*incx] + From a764a743fa73427166d5ad7b27d27047a3149ad2 Mon Sep 17 00:00:00 2001 From: WestonJB Date: Thu, 5 Feb 2026 13:20:03 -0500 Subject: [PATCH 2/2] Added scal, copy, swap; fixed axpy bug --- src/__init__.mojo | 7 +- src/level1/__init__.mojo | 7 +- src/level1/axpy_device.mojo | 10 +- src/level1/copy_device.mojo | 26 ++++ src/level1/scal_device.mojo | 28 ++++ src/level1/swap_device.mojo | 30 ++++ test-level1.mojo | 286 ++++++++++++++++++++++++++++-------- 7 files changed, 322 insertions(+), 72 deletions(-) create mode 100644 src/level1/copy_device.mojo create mode 100644 src/level1/scal_device.mojo create mode 100644 src/level1/swap_device.mojo diff --git a/src/__init__.mojo b/src/__init__.mojo index e902d8b..dd68cc0 100644 --- a/src/__init__.mojo +++ b/src/__init__.mojo @@ -1,3 +1,6 @@ -from .level1.iamax_device import * -from .level1.dot_device import * from .level1.axpy_device import * +from .level1.scal_device import * +from .level1.copy_device import * +from .level1.swap_device import * +from .level1.dot_device import * +from .level1.iamax_device import * \ No newline at end of file diff --git a/src/level1/__init__.mojo b/src/level1/__init__.mojo index b8a05ef..0751aeb 100644 --- a/src/level1/__init__.mojo +++ b/src/level1/__init__.mojo @@ -1,3 +1,6 @@ -from .iamax_device import * +from .axpy_device import * +from .scal_device import * +from .copy_device import * +from .swap_device import * from .dot_device import * -from .axpy_device import * \ No newline at end of file +from .iamax_device import * \ No newline at end of file diff --git a/src/level1/axpy_device.mojo b/src/level1/axpy_device.mojo index 3cd1232..4fe7a0d 100644 --- a/src/level1/axpy_device.mojo +++ b/src/level1/axpy_device.mojo @@ -2,10 +2,10 @@ from gpu import grid_dim, block_dim, global_idx fn axpy_device[dtype: DType]( n: Int, - a: SIMD[dtype, 1], - x: UnsafePointer[SIMD[dtype, 1], ImmutAnyOrigin], + a: Scalar[dtype], + x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], incx: Int, - y: UnsafePointer[SIMD[dtype, 1], MutAnyOrigin], + y: UnsafePointer[Scalar[dtype], MutAnyOrigin], incy: Int ): if (n <= 0): @@ -16,9 +16,9 @@ fn axpy_device[dtype: 
DType]( return var global_i = global_idx.x - var n_threads = grid_dim.x * block_dim.x + var n_threads = Int(grid_dim.x * block_dim.x) - if (n_threads <= global_i): + if (n <= n_threads): # Standard case: each thread gets 1 cell if (global_i < n): y[global_i*incy] += a * x[global_i*incx] diff --git a/src/level1/copy_device.mojo b/src/level1/copy_device.mojo new file mode 100644 index 0000000..0e75182 --- /dev/null +++ b/src/level1/copy_device.mojo @@ -0,0 +1,26 @@ +from gpu import grid_dim, block_dim, global_idx + +fn copy_device[dtype: DType]( + n: Int, + x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[dtype], MutAnyOrigin], + incy: Int +): + if (n <= 0): + return + if (incx == 0 or incy == 0): + return + + var global_i = global_idx.x + var n_threads = Int(grid_dim.x * block_dim.x) + + if (n <= n_threads): + # Standard case: each thread gets 1 cell + if (global_i < n): + y[global_i*incy] = x[global_i*incx] + + else: + # Multiple cells per thread + for i in range(global_i, n, n_threads): + y[i*incy] = x[i*incx] diff --git a/src/level1/scal_device.mojo b/src/level1/scal_device.mojo new file mode 100644 index 0000000..a236c60 --- /dev/null +++ b/src/level1/scal_device.mojo @@ -0,0 +1,28 @@ +from gpu import grid_dim, block_dim, global_idx + +fn scal_device[dtype: DType]( + n: Int, + a: Scalar[dtype], + x: UnsafePointer[Scalar[dtype], MutAnyOrigin], + incx: Int, +): + if (n <= 0): + return + if (a == 0): + return + if (incx == 0): + return + + var global_i = global_idx.x + var n_threads = Int(grid_dim.x * block_dim.x) + + if (n <= n_threads): + # Standard case: each thread gets 1 cell + if (global_i < n): + x[global_i*incx] *= a + + else: + # Multiple cells per thread + for i in range(global_i, n, n_threads): + x[i*incx] *= a + diff --git a/src/level1/swap_device.mojo b/src/level1/swap_device.mojo new file mode 100644 index 0000000..ad34bbb --- /dev/null +++ b/src/level1/swap_device.mojo @@ -0,0 +1,30 @@ +from gpu import grid_dim, block_dim, global_idx + +fn swap_device[dtype: DType]( + n: Int, + x: UnsafePointer[Scalar[dtype], MutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[dtype], MutAnyOrigin], + incy: Int +): + if (n <= 0): + return + if (incx == 0 or incy == 0): + return + + var global_i = global_idx.x + var n_threads = Int(grid_dim.x * block_dim.x) + + if (n <= n_threads): + # Standard case: each thread gets 1 cell + if (global_i < n): + var tmp = x[global_i * incx] + x[global_i * incx] = y[global_i * incy] + y[global_i * incy] = tmp + + else: + # Multiple cells per thread + for i in range(global_i, n, n_threads): + var tmp = x[i * incx] + x[i * incx] = y[i * incy] + y[i * incy] = tmp diff --git a/test-level1.mojo b/test-level1.mojo index 5d9d3cc..e88f9da 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -4,7 +4,7 @@ from gpu.host import DeviceContext from gpu import block_dim, grid_dim, thread_idx from layout import Layout, LayoutTensor -from src import axpy_device, iamax_device, dot_device +from src import axpy_device, scal_device, copy_device, swap_device, dot_device, iamax_device from random import rand, seed from math import ceildiv from python import Python, PythonObject @@ -29,6 +29,210 @@ def generate_random_arr[ for i in range(size): a[i] = min_value + a[i] * rng + +def axpy_test[ + dtype: DType, + size: Int +](): + with DeviceContext() as ctx: + print("[ axpy test:", dtype, "]") + + a: SIMD[dtype, 1] = 0 + x = ctx.enqueue_create_host_buffer[dtype](size) + y = ctx.enqueue_create_host_buffer[dtype](size) + mojo_res = 
ctx.enqueue_create_host_buffer[dtype](size) + + generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + # print("a = ", a) + # print("x = ", x) + # print("y = ", y) + + d_x = ctx.enqueue_create_buffer[dtype](size) + d_y = ctx.enqueue_create_buffer[dtype](size) + + ctx.enqueue_copy(d_x, x) + ctx.enqueue_copy(d_y, y) + + axpy_kernel = ctx.compile_function[axpy_device[dtype], axpy_device[dtype]]() + + ctx.enqueue_function( + axpy_kernel, + size, a, d_x, 1, d_y, 1, + grid_dim=1, + block_dim=size + ) + + sp_blas = Python.import_module("scipy.linalg.blas") + builtins = Python.import_module("builtins") + + x_py = Python.list() + y_py = Python.list() + for i in range(size): + x_py.append(x[i]) + y_py.append(y[i]) + + if dtype == DType.float32: + sp_result = sp_blas.saxpy(x_py, y_py, a=a) + elif dtype == DType.float64: + sp_result = sp_blas.daxpy(x_py, y_py, a=a) + else: + print(dtype , " is not supported by SciPy") + return + + ctx.enqueue_copy(mojo_res, d_y) + ctx.synchronize() + + # Prints too much for large vectors. May want to add a verbose option + # print("out:", mojo_res) + # print("expected", sp_result) + + for i in range(size): + assert_almost_equal(Scalar[dtype](py=sp_result[i]), mojo_res[i], atol=atol) + + +def scal_test[ + dtype: DType, + size: Int +](): + with DeviceContext() as ctx: + print("[ scal test:", dtype, "]") + + a: SIMD[dtype, 1] = 0 + x = ctx.enqueue_create_host_buffer[dtype](size) + mojo_res = ctx.enqueue_create_host_buffer[dtype](size) + + generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) + generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + # print("a = ", a) + # print("x = ", x) + + d_x = ctx.enqueue_create_buffer[dtype](size) + + ctx.enqueue_copy(d_x, x) + + scal_kernel = ctx.compile_function[scal_device[dtype], scal_device[dtype]]() + + ctx.enqueue_function( + scal_kernel, + size, a, d_x, 1, + grid_dim=1, + block_dim=size + ) + + sp_blas = Python.import_module("scipy.linalg.blas") + builtins = Python.import_module("builtins") + + x_py = Python.list() + for i in range(size): + x_py.append(x[i]) + + if dtype == DType.float32: + sp_result = sp_blas.sscal(a, x_py) + elif dtype == DType.float64: + sp_result = sp_blas.dscal(a, x_py) + else: + print(dtype , " is not supported by SciPy") + return + + ctx.enqueue_copy(mojo_res, d_x) + ctx.synchronize() + + # Prints too much for large vectors. 
May want to add a verbose option + # print("out:", mojo_res) + # print("expected", sp_result) + + for i in range(size): + assert_almost_equal(Scalar[dtype](py=sp_result[i]), mojo_res[i], atol=atol) + + +def copy_test[ + dtype: DType, + size: Int +](): + with DeviceContext() as ctx: + print("[ copy test:", dtype, "]") + + x = ctx.enqueue_create_host_buffer[dtype](size) + y = ctx.enqueue_create_host_buffer[dtype](size) + + generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + # print("x = ", x) + # print("y = ", y) + + d_x = ctx.enqueue_create_buffer[dtype](size) + d_y = ctx.enqueue_create_buffer[dtype](size) + + ctx.enqueue_copy(d_x, x) + ctx.enqueue_copy(d_y, y) + + copy_kernel = ctx.compile_function[copy_device[dtype], copy_device[dtype]]() + + ctx.enqueue_function( + copy_kernel, + size, d_x, 1, d_y, 1, + grid_dim=1, + block_dim=size + ) + + ctx.enqueue_copy(y, d_y) + ctx.synchronize() + + # Prints too much for large vectors. May want to add a verbose option + # print("out:", mojo_res) + # print("expected", sp_result) + + for i in range(size): + assert_equal(x[i], y[i]) + + +def swap_test[ + dtype: DType, + size: Int +](): + with DeviceContext() as ctx: + print("[ swap test:", dtype, "]") + + x = ctx.enqueue_create_host_buffer[dtype](size) + y = ctx.enqueue_create_host_buffer[dtype](size) + x2 = ctx.enqueue_create_host_buffer[dtype](size) + y2 = ctx.enqueue_create_host_buffer[dtype](size) + + generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) + generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) + # print("x = ", x) + # print("y = ", y) + + d_x = ctx.enqueue_create_buffer[dtype](size) + d_y = ctx.enqueue_create_buffer[dtype](size) + + ctx.enqueue_copy(d_x, x) + ctx.enqueue_copy(d_y, y) + + swap_kernel = ctx.compile_function[swap_device[dtype], swap_device[dtype]]() + + ctx.enqueue_function( + swap_kernel, + size, d_x, 1, d_y, 1, + grid_dim=1, + block_dim=size + ) + + ctx.enqueue_copy(x2, d_x) + ctx.enqueue_copy(y2, d_y) + ctx.synchronize() + + # Prints too much for large vectors. 
May want to add a verbose option + # print("out:", mojo_res) + # print("expected", sp_result) + + for i in range(size): + assert_equal(x[i], y2[i]) + assert_equal(y[i], x2[i]) + + def dot_test[ dtype: DType, size: Int @@ -89,7 +293,7 @@ def dot_test[ np_b = np.array(py_b, dtype=np.float64) sp_res = sp_blas.ddot(np_a, np_b) else: - print(dtype , "is not supported by SciPy") + print(dtype , " is not supported by SciPy") return sp_res_mojo = Scalar[dtype](py=sp_res) @@ -148,7 +352,7 @@ def iamax_test[ np_v = np.array(py_list, dtype=np.float64) sp_res = sp_blas.idamax(np_v) else: - print(dtype , "is not supported by SciPy") + print(dtype , " is not supported by SciPy") return # Move Mojo result from CPU to GPU and compare to SciPy @@ -159,80 +363,36 @@ def iamax_test[ assert_equal(res_mojo[0], sp_res_mojo) -def axpy_test[ - dtype: DType, - size: Int -](): - with DeviceContext() as ctx: - print("[ axpy test:", dtype, "]") +def test_axpy(): + axpy_test[DType.float32, 256]() + axpy_test[DType.float64, 256]() - a: SIMD[dtype, 1] = 0 - x = ctx.enqueue_create_host_buffer[dtype](size) - y = ctx.enqueue_create_host_buffer[dtype](size) - mojo_res = ctx.enqueue_create_host_buffer[dtype](size) - generate_random_arr[dtype, 1](UnsafePointer[SIMD[dtype, 1]](to=a), -10000, 10000) - generate_random_arr[dtype, size](x.unsafe_ptr(), -10000, 10000) - generate_random_arr[dtype, size](y.unsafe_ptr(), -10000, 10000) - # print("a = ", a) - # print("x = ", x) - # print("y = ", y) +def test_scal(): + scal_test[DType.float32, 256]() + scal_test[DType.float64, 256]() - d_x = ctx.enqueue_create_buffer[dtype](size) - d_y = ctx.enqueue_create_buffer[dtype](size) - ctx.enqueue_copy(d_x, x) - ctx.enqueue_copy(d_y, y) +def test_copy(): + copy_test[DType.float32, 256]() + copy_test[DType.float64, 256]() - axpy_kernel = ctx.compile_function[axpy_device[dtype], axpy_device[dtype]]() - ctx.enqueue_function( - axpy_kernel, - size, a, d_x, 1, d_y, 1, - grid_dim=1, - block_dim=size - ) +def test_swap(): + swap_test[DType.float32, 256]() + swap_test[DType.float64, 256]() - sp_blas = Python.import_module("scipy.linalg.blas") - builtins = Python.import_module("builtins") - x_py = Python.list() - y_py = Python.list() - for i in range(size): - x_py.append(x[i]) - y_py.append(y[i]) - - if dtype == DType.float32: - sp_result = sp_blas.saxpy(x_py, y_py, a=a) - elif dtype == DType.float64: - sp_result = sp_blas.daxpy(x_py, y_py, a=a) - else: - print(dtype , "is not supported by SciPy") - return - - ctx.enqueue_copy(mojo_res, d_y) - ctx.synchronize() - - # Prints too much for large vectors. May want to add a verbose option - # print("out:", mojo_res) - # print("expected", sp_result) +def test_dot(): + dot_test[DType.float32, 256]() + # It looks like warp_sum doesn't support float64 + # dot_test[DType.float64, 256]() - for i in range(size): - var f = Scalar[dtype](py=sp_result[i]) - assert_almost_equal(Scalar[dtype](py=sp_result[i]), mojo_res[i], atol=1.0E-6) def test_iamax(): iamax_test[DType.float32, 256]() iamax_test[DType.float64, 256]() -def test_dot(): - dot_test[DType.float32, 256]() - # It looks like warp_sum doesn't support float64 - # dot_test[DType.float64, 256]() - -def test_axpy(): - axpy_test[DType.float32, 256]() - axpy_test[DType.float64, 256]() def main(): print("--- MojoBLAS Level 1 routines testing ---")
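
Note on the launch configuration used in these patches: every test enqueues its kernel with grid_dim=1 and block_dim=size, so the total thread count always equals n and the new "multiple cells per thread" branch is never actually exercised. Launching with fewer threads than elements (for example block_dim=64 for the 256-element test vectors) would cover the grid-stride path. The strided loop alone also already handles the one-element-per-thread case, since range(global_i, n, n_threads) yields at most one index per thread when n <= n_threads. The sketch below illustrates both points; the function name axpy_device_strided and the block size of 64 are illustrative choices, not part of the patches.

from gpu import grid_dim, block_dim, global_idx

# Sketch: a grid-stride-only form of the axpy kernel. The strided loop
# covers both the n <= n_threads case (at most one index per thread)
# and the n > n_threads case (several indices per thread).
fn axpy_device_strided[dtype: DType](
    n: Int,
    a: Scalar[dtype],
    x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    incy: Int
):
    if (n <= 0):
        return
    if (a == 0):
        return
    if (incx == 0 or incy == 0):
        return

    var global_i = global_idx.x
    var n_threads = Int(grid_dim.x * block_dim.x)

    # Each thread handles indices global_i, global_i + n_threads, ...
    for i in range(global_i, n, n_threads):
        y[i*incy] += a * x[i*incx]

A launch like the following (inside axpy_test, with size = 256) would make n_threads = 64 < n and exercise the strided path:

        ctx.enqueue_function(
            axpy_kernel,
            size, a, d_x, 1, d_y, 1,
            grid_dim=1,
            block_dim=64
        )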