From c58048a156350947ff45f7a2e8a5dddc07176513 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Thu, 20 Nov 2025 19:32:37 +0100 Subject: [PATCH 01/32] Introduce matrix multiplication wrapper function --- include/mlp.h | 1 + src/matrix.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/mlp.h b/include/mlp.h index b2a6616..9ef6cdf 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -63,6 +63,7 @@ LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size); NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size); NEURON *init_neuron(int num_weights); void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols); double sse(double *real, double *ideal, int length); void transpose_matrix(const double *A, double *C, int rows, int cols); diff --git a/src/matrix.c b/src/matrix.c index 6b40dc2..c070de8 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -1,6 +1,11 @@ #include "mlp.h" void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +} + +void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { for(int i=0; i Date: Thu, 20 Nov 2025 19:50:36 +0100 Subject: [PATCH 02/32] Create `legacy/` & move old UPMEM files inside --- Makefile => legacy/Makefile | 0 generate.py => legacy/generate.py | 0 host.c => legacy/host.c | 0 matmul.template => legacy/matmul.template | 0 matrices.template => legacy/matrices.template | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename Makefile => legacy/Makefile (100%) rename generate.py => legacy/generate.py (100%) rename host.c => legacy/host.c (100%) rename matmul.template => 
legacy/matmul.template (100%) rename matrices.template => legacy/matrices.template (100%) diff --git a/Makefile b/legacy/Makefile similarity index 100% rename from Makefile rename to legacy/Makefile diff --git a/generate.py b/legacy/generate.py similarity index 100% rename from generate.py rename to legacy/generate.py diff --git a/host.c b/legacy/host.c similarity index 100% rename from host.c rename to legacy/host.c diff --git a/matmul.template b/legacy/matmul.template similarity index 100% rename from matmul.template rename to legacy/matmul.template diff --git a/matrices.template b/legacy/matrices.template similarity index 100% rename from matrices.template rename to legacy/matrices.template From ae403fab97f07eb66247caeaff268421c341b527 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Thu, 20 Nov 2025 21:25:39 +0100 Subject: [PATCH 03/32] Implement tiled matrix multiplication --- include/mlp.h | 2 ++ src/matrix.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/include/mlp.h b/include/mlp.h index 9ef6cdf..4eef999 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -31,6 +31,8 @@ #define MOMENTUM 0.8 #endif +#define TILE_SIZE 16 + extern unsigned int rseed; typedef struct { diff --git a/src/matrix.c b/src/matrix.c index c070de8..6355cee 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -2,7 +2,48 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { - multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); + double tileA[TILE_SIZE][TILE_SIZE]; + double tileB[TILE_SIZE][TILE_SIZE]; + double tileC[TILE_SIZE][TILE_SIZE]; + + for(int i=0; i Date: Fri, 21 Nov 2025 08:30:14 +0100 Subject: [PATCH 04/32] Implement UPMEM-distributed matrix multiplication --- dpu_program.c | 32 +++++++++++++++++++ include/upmem.h | 21 +++++++++++++ src/matrix.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 134 insertions(+), 1 deletion(-) create mode 
100644 dpu_program.c create mode 100644 include/upmem.h diff --git a/dpu_program.c b/dpu_program.c new file mode 100644 index 0000000..193587e --- /dev/null +++ b/dpu_program.c @@ -0,0 +1,32 @@ +#include +#include +#include "upmem.h" + +__mram_noinit double A_chunk[ROWS_A_PER_DPU_MAX * COLS_A_MAX]; +__mram_noinit double B_whole[COLS_A_MAX * COLS_B_MAX]; +__mram_noinit double C_chunk[ROWS_A_PER_DPU_MAX * COLS_B_MAX]; + +__host dpu_args_t DPU_INPUT_ARGS; + +int main() +{ + dpu_args_t dpu_input_args = DPU_INPUT_ARGS; + uint32_t rows_a = dpu_input_args.rows_a; + uint32_t cols_a = dpu_input_args.cols_a; + uint32_t cols_b = dpu_input_args.cols_b; + + if(!rows_a) + return 0; + + for(int i=0; i + +#define ROWS_A_MAX 96 +#define COLS_A_MAX 96 +#define COLS_B_MAX 96 +#define ROWS_A_PER_DPU_MAX 16 + +#define NUM_DPU 16 + +typedef struct { + uint32_t rows_a; + uint32_t cols_a; + uint32_t cols_b; +} dpu_args_t; + +void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); + +#endif diff --git a/src/matrix.c b/src/matrix.c index 6355cee..dbc6e9a 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -1,4 +1,7 @@ +#include +#include #include "mlp.h" +#include "upmem.h" void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { @@ -32,7 +35,8 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } - multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + // multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + multiply_matrix_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); for(int row=0; row= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) + : dpu_rows_a_max; + + dpu_args_t args = { + .rows_a = dpu_rows_a_actual, + .cols_a = cols_a, + .cols_b = cols_b + }; + + DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); + + if(dpu_rows_a_actual) { + uint32_t elems_a = dpu_rows_a_actual * cols_a; + uint32_t bytes_a = elems_a * sizeof(double); + + double *A_chunk = (double*)malloc(bytes_a); + + for(int r=0; r= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + if(dpu_rows_a_actual) { + uint32_t elems_c = dpu_rows_a_actual * cols_b; + uint32_t bytes_c = elems_c * sizeof(double); + + double *C_chunk = (double*)malloc(bytes_c); + + DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); + + for(int r=0; r Date: Fri, 21 Nov 2025 08:39:01 +0100 Subject: [PATCH 05/32] Improve file organization --- .gitignore | 1 - include/upmem.h | 2 ++ dpu_program.c => src/dpu/dpu_program.c | 0 src/{ => host}/accumulate_layer_gradients.c | 0 src/{ => host}/activation.c | 0 src/{ => host}/apply_gradients.c | 0 src/{ => host}/drand.c | 0 src/{ => host}/get_delta.c | 0 src/{ => host}/get_total_loss.c | 0 src/{ => host}/get_y.c | 0 src/{ => host}/get_z.c | 0 src/{ => host}/init_layer.c | 0 src/{ => host}/init_network.c | 0 src/{ => host}/init_neuron.c | 0 src/{ => host}/matrix.c | 2 +- src/{ => host}/mlp.c | 0 src/{ => host}/read_image_data.c | 0 src/{ => host}/sse.c | 0 src/{ => host}/utils.c | 0 19 files changed, 3 insertions(+), 2 deletions(-) rename dpu_program.c => src/dpu/dpu_program.c (100%) rename src/{ => host}/accumulate_layer_gradients.c (100%) rename src/{ => host}/activation.c (100%) rename src/{ => host}/apply_gradients.c (100%) rename src/{ => host}/drand.c (100%) rename src/{ => host}/get_delta.c (100%) rename src/{ => host}/get_total_loss.c (100%) rename src/{ => host}/get_y.c (100%) rename src/{ => host}/get_z.c (100%) rename src/{ => host}/init_layer.c (100%) rename src/{ => host}/init_network.c (100%) rename src/{ => 
host}/init_neuron.c (100%) rename src/{ => host}/matrix.c (98%) rename src/{ => host}/mlp.c (100%) rename src/{ => host}/read_image_data.c (100%) rename src/{ => host}/sse.c (100%) rename src/{ => host}/utils.c (100%) diff --git a/.gitignore b/.gitignore index 24a870c..274d4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ matmul.c matrices.h -dpu/ *.o *.out training_images.txt diff --git a/include/upmem.h b/include/upmem.h index 67b3069..a2feb78 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -10,6 +10,8 @@ #define NUM_DPU 16 +#define DPU_BINARY_PATH "build/dpu_program" + typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/dpu_program.c b/src/dpu/dpu_program.c similarity index 100% rename from dpu_program.c rename to src/dpu/dpu_program.c diff --git a/src/accumulate_layer_gradients.c b/src/host/accumulate_layer_gradients.c similarity index 100% rename from src/accumulate_layer_gradients.c rename to src/host/accumulate_layer_gradients.c diff --git a/src/activation.c b/src/host/activation.c similarity index 100% rename from src/activation.c rename to src/host/activation.c diff --git a/src/apply_gradients.c b/src/host/apply_gradients.c similarity index 100% rename from src/apply_gradients.c rename to src/host/apply_gradients.c diff --git a/src/drand.c b/src/host/drand.c similarity index 100% rename from src/drand.c rename to src/host/drand.c diff --git a/src/get_delta.c b/src/host/get_delta.c similarity index 100% rename from src/get_delta.c rename to src/host/get_delta.c diff --git a/src/get_total_loss.c b/src/host/get_total_loss.c similarity index 100% rename from src/get_total_loss.c rename to src/host/get_total_loss.c diff --git a/src/get_y.c b/src/host/get_y.c similarity index 100% rename from src/get_y.c rename to src/host/get_y.c diff --git a/src/get_z.c b/src/host/get_z.c similarity index 100% rename from src/get_z.c rename to src/host/get_z.c diff --git a/src/init_layer.c b/src/host/init_layer.c similarity index 100% rename from 
src/init_layer.c rename to src/host/init_layer.c diff --git a/src/init_network.c b/src/host/init_network.c similarity index 100% rename from src/init_network.c rename to src/host/init_network.c diff --git a/src/init_neuron.c b/src/host/init_neuron.c similarity index 100% rename from src/init_neuron.c rename to src/host/init_neuron.c diff --git a/src/matrix.c b/src/host/matrix.c similarity index 98% rename from src/matrix.c rename to src/host/matrix.c index dbc6e9a..12220b6 100644 --- a/src/matrix.c +++ b/src/host/matrix.c @@ -68,7 +68,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); - DPU_ASSERT(dpu_load(dpus, "build/dpu_program", NULL)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; diff --git a/src/mlp.c b/src/host/mlp.c similarity index 100% rename from src/mlp.c rename to src/host/mlp.c diff --git a/src/read_image_data.c b/src/host/read_image_data.c similarity index 100% rename from src/read_image_data.c rename to src/host/read_image_data.c diff --git a/src/sse.c b/src/host/sse.c similarity index 100% rename from src/sse.c rename to src/host/sse.c diff --git a/src/utils.c b/src/host/utils.c similarity index 100% rename from src/utils.c rename to src/host/utils.c From 60f14325b952e2084504dd93de032be8eb9f1148 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 09:50:09 +0100 Subject: [PATCH 06/32] Add Makefile --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7b7fdb4 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang +DPU_UPMEM_CFLAGS += +FILES_TO_DELETE = build/ + +all: clean + mkdir build; \ + $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ + gcc -std=c99 -O0 -Iinclude src/host/*.c 
-D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=8 -DNUM_TRAIN_SAMPLES=40 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + +clean: + rm -rf $(FILES_TO_DELETE) From dd5d1cdb0aeb1e6f37bb6aef21639229d69bb6bc Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 14:02:54 +0100 Subject: [PATCH 07/32] Move `dpu_load` to improve runtime --- src/host/matrix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index 12220b6..bb66fe3 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -68,12 +68,12 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); - DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); - - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t dpu_idx = 0; DPU_FOREACH(dpus, dpu) { + DPU_ASSERT(dpu_load(dpu, DPU_BINARY_PATH, NULL)); + + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) @@ -112,6 +112,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx = 0; DPU_FOREACH(dpus, dpu) { + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) @@ -136,7 +137,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx++; } - dpu_free(dpus); + DPU_ASSERT(dpu_free(dpus)); } void transpose_matrix(const double* A, double *C, int rows, int cols) From bf0b070d90b01627c2398f6a5906031c7d8888c0 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 14:28:55 +0100 Subject: [PATCH 08/32] Broadcast matrix `B` to DPUs instead of copying it --- src/host/matrix.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index bb66fe3..f6edbfc 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -68,12 +68,16 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + + unsigned int bytes_b = cols_a * cols_b * sizeof(double); + DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, DPU_XFER_DEFAULT)); + + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t dpu_idx = 0; DPU_FOREACH(dpus, dpu) { - DPU_ASSERT(dpu_load(dpu, DPU_BINARY_PATH, NULL)); - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) @@ -102,9 +106,6 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows free(A_chunk); } - unsigned int bytes_b = cols_a * cols_b * sizeof(double); - DPU_ASSERT(dpu_copy_to(dpu, "B_whole", 0, B, bytes_b)); - dpu_idx++; } @@ -112,7 +113,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx = 0; DPU_FOREACH(dpus, dpu) { - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; + uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 
0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) From 05d42ca5c93a3e5ce64eb2e4c8fcfdb7efbc507b Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 15:23:43 +0100 Subject: [PATCH 09/32] Move DPU allocation and loading before tiling --- src/host/matrix.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index f6edbfc..d9836cc 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -3,6 +3,8 @@ #include "mlp.h" #include "upmem.h" +struct dpu_set_t dpus, dpu; + void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { double tileA[TILE_SIZE][TILE_SIZE]; @@ -15,6 +17,9 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } + DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + for(int ii=0; ii Date: Fri, 21 Nov 2025 15:42:22 +0100 Subject: [PATCH 10/32] Reorganize matrix multiplication functions --- include/upmem.h | 1 + src/host/matrix.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/upmem.h b/include/upmem.h index a2feb78..09ae4e5 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -19,5 +19,6 @@ typedef struct { } dpu_args_t; void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); #endif diff --git a/src/host/matrix.c b/src/host/matrix.c index d9836cc..f696e79 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -6,6 +6,12 @@ struct dpu_set_t dpus, dpu; void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); + // multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +} + +void multiply_matrix_upmem(const double *A, const double *B, 
double *C, int rows_a, int cols_a, int cols_b) { double tileA[TILE_SIZE][TILE_SIZE]; double tileB[TILE_SIZE][TILE_SIZE]; @@ -40,8 +46,7 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } - // multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); - multiply_matrix_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); for(int row=0; row Date: Fri, 21 Nov 2025 16:17:36 +0100 Subject: [PATCH 11/32] Add feature toggle for UPMEM to `multiply_matrix` --- src/host/matrix.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index f696e79..f8b30f1 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -7,8 +7,11 @@ struct dpu_set_t dpus, dpu; void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { +#ifdef UPMEM multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); - // multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +#else + multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +#endif } void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) From c72628bf9332aed46e432f3f2454820733995a9f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 16:23:02 +0100 Subject: [PATCH 12/32] Adapt unit tests for updates in `multiply_matrix` --- CMakeLists.txt | 31 ++++++++++++++++++++++++++++--- include/upmem.h | 4 ++++ tests/test_matrix.c | 15 ++++++++++++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d408d10..ee5fc92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,18 +6,43 @@ set(CMAKE_C_STANDARD_REQUIRED ON) include_directories(include) -file(GLOB SRC_FILES src/*.c) -list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/mlp.c") 
+file(GLOB SRC_FILES src/host/*.c) +list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/host/mlp.c") file(GLOB TEST_FILES tests/*.c) +execute_process( + COMMAND dpu-pkg-config --cflags dpu + OUTPUT_VARIABLE DPU_C_FLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND dpu-pkg-config --libs dpu + OUTPUT_VARIABLE DPU_LIBS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + enable_testing() +add_custom_target(build_dpu_program ALL + COMMAND dpu-upmem-dpurte-clang + -I${CMAKE_SOURCE_DIR}/include + -o ${CMAKE_BINARY_DIR}/dpu_program + ${CMAKE_SOURCE_DIR}/src/dpu/dpu_program.c +) + +add_compile_definitions( + NUM_DPU=1 + DPU_BINARY_PATH=\"./dpu_program\" +) + foreach(TEST_SRC ${TEST_FILES}) get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) add_executable(${TEST_NAME} ${TEST_SRC} ${SRC_FILES}) target_include_directories(${TEST_NAME} PRIVATE include) - target_link_libraries(${TEST_NAME} m) + target_compile_options(${TEST_NAME} PRIVATE ${DPU_C_FLAGS}) + target_link_libraries(${TEST_NAME} PRIVATE m ${DPU_LIBS}) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) endforeach() \ No newline at end of file diff --git a/include/upmem.h b/include/upmem.h index 09ae4e5..aee5348 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -8,9 +8,13 @@ #define COLS_B_MAX 96 #define ROWS_A_PER_DPU_MAX 16 +#ifndef NUM_DPU #define NUM_DPU 16 +#endif +#ifndef DPU_BINARY_PATH #define DPU_BINARY_PATH "build/dpu_program" +#endif typedef struct { uint32_t rows_a; diff --git a/tests/test_matrix.c b/tests/test_matrix.c index 25323e9..b77650a 100644 --- a/tests/test_matrix.c +++ b/tests/test_matrix.c @@ -12,20 +12,29 @@ int test_multiply_matrix() 3.0, 3.0, 4.0, 0.0}; - // result matrix (initialized with random double values [0.0, 20.0]) + // result matrices (initialized with random double values [0.0, 20.0]) double matrixC[2*2]; + double matrixD[2*2]; for(int i=0; i<2*2; i++) { matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixD[i] = ((double)rand() / (double)RAND_MAX) * 
20; } // ideal result double matrixR[2*2] = {20.0, 12.0, 39.0, 15.0}; - multiply_matrix(matrixA, matrixB, matrixC, 2, 3, 2); + multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2); + + multiply_matrix_upmem(matrixA, matrixB, matrixD, 2, 3, 2); + + for(int i=0; i<2*2; i++) { + printf("%lf ", matrixC[i]); + } for(int i=0; i<2*2; i++) { - test_result_pass_fail |= matrixC[i] == matrixR[i]; + test_result_pass_fail &= matrixC[i] == matrixR[i]; + test_result_pass_fail &= matrixC[i] == matrixD[i]; } return test_result_pass_fail; From ed06cc39aedc1677d2ea2547dbb6d99a24959c8a Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 19:17:11 +0100 Subject: [PATCH 13/32] Reorganize header files & macros --- CMakeLists.txt | 3 ++- include/mlp.h | 2 -- include/upmem.h | 19 ++++++++++++------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee5fc92..26c5fc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,8 @@ add_custom_target(build_dpu_program ALL ) add_compile_definitions( - NUM_DPU=1 + # NUM_CPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between + # dpu_program.c and the rest. So this file should avoid modifying dimensions set through macros in aforementioned header files. 
DPU_BINARY_PATH=\"./dpu_program\" ) diff --git a/include/mlp.h b/include/mlp.h index 4eef999..9ef6cdf 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -31,8 +31,6 @@ #define MOMENTUM 0.8 #endif -#define TILE_SIZE 16 - extern unsigned int rseed; typedef struct { diff --git a/include/upmem.h b/include/upmem.h index aee5348..3464057 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -3,19 +3,24 @@ #include -#define ROWS_A_MAX 96 -#define COLS_A_MAX 96 -#define COLS_B_MAX 96 -#define ROWS_A_PER_DPU_MAX 16 +#define ROWS_A_MAX 320 +#define COLS_A_MAX 320 +#define COLS_B_MAX 320 + +#ifndef DPU_BINARY_PATH +#define DPU_BINARY_PATH "build/dpu_program" +#endif #ifndef NUM_DPU -#define NUM_DPU 16 +#define NUM_DPU 64 #endif -#ifndef DPU_BINARY_PATH -#define DPU_BINARY_PATH "build/dpu_program" +#ifndef TILE_SIZE +#define TILE_SIZE 128 #endif +#define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) + typedef struct { uint32_t rows_a; uint32_t cols_a; From 087f98ddcb07e2e30ce79ba89df2be59ee6b6287 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 19:17:32 +0100 Subject: [PATCH 14/32] Add some assertions to `multiply_matrix_upmem` --- src/host/matrix.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/host/matrix.c b/src/host/matrix.c index f8b30f1..22cfb28 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -1,3 +1,4 @@ +#include #include #include #include "mlp.h" @@ -16,6 +17,8 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { + assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); + double tileA[TILE_SIZE][TILE_SIZE]; double tileB[TILE_SIZE][TILE_SIZE]; double tileC[TILE_SIZE][TILE_SIZE]; @@ -80,6 +83,9 @@ void multiply_matrix_naive(const double *A, const double *B, double *C, int rows void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { + 
assert(rows_a <= ROWS_A_MAX); + assert(cols_a <= COLS_A_MAX); + assert(cols_b <= COLS_B_MAX); unsigned int bytes_b = cols_a * cols_b * sizeof(double); DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, DPU_XFER_DEFAULT)); From 8d02ab460f85c3152d02b4b08aafb768f3329926 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:03:50 +0100 Subject: [PATCH 15/32] Shorten runtime --- Makefile | 2 +- src/host/mlp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 7b7fdb4..d3b345b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ FILES_TO_DELETE = build/ all: clean mkdir build; \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ - gcc -std=c99 -O0 -Iinclude src/host/*.c -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=8 -DNUM_TRAIN_SAMPLES=40 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + gcc -std=c99 -Iinclude src/host/*.c -DUPMEM -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: rm -rf $(FILES_TO_DELETE) diff --git a/src/host/mlp.c b/src/host/mlp.c index 012b37d..6a6036b 100644 --- a/src/host/mlp.c +++ b/src/host/mlp.c @@ -11,8 +11,8 @@ int main() int epoch = 0; int num_inputs = NUM_FEATURES; - int num_layers = 5; - int num_neurons_per_layer[] = {NUM_FEATURES, 1000, 1000, 100, NUM_LABELS}; + int num_layers = 3; + int num_neurons_per_layer[] = {NUM_FEATURES, 10, NUM_LABELS}; NETWORK *n = init_network(num_inputs, num_layers, num_neurons_per_layer, BATCH_SIZE); if(!n) { From 8a7b83a0e2ba57caa5432bd58b72f50cc48f5be1 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:04:16 +0100 Subject: [PATCH 16/32] Adapt CI for UPMEM toolchain --- .github/build_upmem_toolchain.sh | 7 +++++++ .github/workflows/unit_tests.yaml | 20 +++++++++++++++----- .github/workflows/valgrind.yaml | 8 ++++++-- 3 files changed, 28 insertions(+), 7 deletions(-) 
create mode 100644 .github/build_upmem_toolchain.sh diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh new file mode 100644 index 0000000..cf7b9ab --- /dev/null +++ b/.github/build_upmem_toolchain.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd /opt/ +git clone https://github.com/kagandikmen/upmem-sdk.git +tar -xvf upmem-sdk/upmem-2024.2.0-Linux-x86_64.tar.gz +mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/ +rm -rf upmem-sdk/ \ No newline at end of file diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml index cba3dcc..8f057a3 100644 --- a/.github/workflows/unit_tests.yaml +++ b/.github/workflows/unit_tests.yaml @@ -15,19 +15,29 @@ jobs: submodules: 'recursive' - name: Install dependencies - run: sudo apt update && sudo apt install -y build-essential + run: | + sudo apt update && sudo apt install -y build-essential + sudo ./.github/build_upmem_toolchain.sh - name: Create build directory - run: mkdir build + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + mkdir build - name: Run CMake working-directory: build - run: cmake .. + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + cmake .. 
- name: Build working-directory: build - run: make + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + make - name: Run the tests working-directory: build - run: make test \ No newline at end of file + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + make test \ No newline at end of file diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/valgrind.yaml index 69cfa97..f38c27a 100644 --- a/.github/workflows/valgrind.yaml +++ b/.github/workflows/valgrind.yaml @@ -19,21 +19,25 @@ jobs: sudo apt update sudo apt install -y build-essential valgrind pip3 install numpy + sudo ./.github/build_upmem_toolchain.sh - name: Extract training samples & labels run: python3 read_dataset.py - name: Compile MLP - run: gcc -g -DEPSILON=0.5 -DNUM_TRAIN_SAMPLES=2 -Iinclude src/*.c -o mlp -lm + run: | + source /usr/local/bin/upmem-sdk/upmem_env.sh + make - name: Run Valgrind run: | + source /usr/local/bin/upmem-sdk/upmem_env.sh valgrind --leak-check=full \ --show-leak-kinds=all \ --track-origins=yes \ --error-exitcode=1 \ --log-file=valgrind.txt \ - ./mlp > /dev/null + ./build/mlp > /dev/null - name: Save Valgrind log if: always() From 5b1646a458fdfb57d55aad1e5bcbe8d1f8bb09c9 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:08:40 +0100 Subject: [PATCH 17/32] Update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a4acb69..7deb945 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ python3 read_dataset.py 3. Compile the application: ```bash -gcc -Iinclude src/*.c -o mlp -lm +make ``` With this command, you can use: @@ -86,11 +86,11 @@ With this command, you can use: ## Status -UPMEM-MLP is a work in progress as of 2025-11-14. +UPMEM-MLP is a work in progress as of 2025-11-21. 
### To-Do -- [ ] Adapt `multiply_matrix` for in-memory matrix multiplication on UPMEM +- [ ] Evaluate and document acceleration achieved by matrix multiplication on UPMEM DIMM ## License From 404ab1d0dc7f5785e7c1472cc369da41def2d51f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen <136203535+kagandikmen@users.noreply.github.com> Date: Sat, 22 Nov 2025 09:07:40 +0100 Subject: [PATCH 18/32] Debug CI (#1) * Debug CI commit 1 * Debug CI commit 2 * Debug CI commit 3 * Debug CI commit 4 * Debug CI commit 5 * Debug CI commit 6 * Debug CI commit 7 * Debug CI commit 8 * Debug CI commit 9 * Debug CI commit 10 * Debug CI commit 11 * Debug CI commit 12 * Debug CI commit 13 * Debug CI commit 14 * Debug CI commit 15 * Debug CI commit 16 * Debug CI commit 17 --- .../{valgrind.yaml => memory_leak_tests.yaml} | 27 +++++++++++++------ .github/workflows/unit_tests.yaml | 10 +++---- Makefile | 15 +++++++++-- 3 files changed, 36 insertions(+), 16 deletions(-) rename .github/workflows/{valgrind.yaml => memory_leak_tests.yaml} (54%) diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/memory_leak_tests.yaml similarity index 54% rename from .github/workflows/valgrind.yaml rename to .github/workflows/memory_leak_tests.yaml index f38c27a..e6a8ba3 100644 --- a/.github/workflows/valgrind.yaml +++ b/.github/workflows/memory_leak_tests.yaml @@ -1,4 +1,4 @@ -name: Valgrind +name: Memory Leak Tests on: push: @@ -6,7 +6,7 @@ on: jobs: memcheck: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -19,19 +19,19 @@ jobs: sudo apt update sudo apt install -y build-essential valgrind pip3 install numpy - sudo ./.github/build_upmem_toolchain.sh + sudo bash .github/build_upmem_toolchain.sh - name: Extract training samples & labels run: python3 read_dataset.py - - name: Compile MLP + - name: Compile MLP without sanitizer or UPMEM run: | - source /usr/local/bin/upmem-sdk/upmem_env.sh - make + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + 
make SAN=0 UPMEM=0 - name: Run Valgrind run: | - source /usr/local/bin/upmem-sdk/upmem_env.sh + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator valgrind --leak-check=full \ --show-leak-kinds=all \ --track-origins=yes \ @@ -44,4 +44,15 @@ jobs: uses: actions/upload-artifact@v4 with: name: valgrind_log - path: valgrind.txt \ No newline at end of file + path: valgrind.txt + + - name: Compile MLP with sanitizer and UPMEM + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + make clean + make SAN=1 UPMEM=1 + + - name: Run with sanitizer + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + ./build/mlp > /dev/null \ No newline at end of file diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml index 8f057a3..897d053 100644 --- a/.github/workflows/unit_tests.yaml +++ b/.github/workflows/unit_tests.yaml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -16,13 +16,11 @@ jobs: - name: Install dependencies run: | - sudo apt update && sudo apt install -y build-essential - sudo ./.github/build_upmem_toolchain.sh + sudo apt update && sudo apt install -y build-essential python3.10 python3.10-dev + sudo bash .github/build_upmem_toolchain.sh - name: Create build directory - run: | - source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh - mkdir build + run: mkdir build - name: Run CMake working-directory: build diff --git a/Makefile b/Makefile index d3b345b..6b1b21a 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,22 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang DPU_UPMEM_CFLAGS += +CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ -all: clean +UPMEM ?= 1 +ifeq ($(UPMEM), 1) + CFLAGS += -DUPMEM +endif + +SAN ?= 0 +ifeq ($(SAN), 1) + CFLAGS += -fsanitize=address,undefined,leak 
-fno-omit-frame-pointer -g +endif + +all: mkdir build; \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ - gcc -std=c99 -Iinclude src/host/*.c -DUPMEM -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: rm -rf $(FILES_TO_DELETE) From d0d4be897ecd0031a60d98a168a7daac71d8d521 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 10:36:52 +0100 Subject: [PATCH 19/32] Add `_dpus` & improve organization --- include/upmem.h | 4 ++ src/host/dpu_host.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ src/host/matrix.c | 134 +---------------------------------------- src/host/mlp.c | 6 ++ tests/test_matrix.c | 3 + 5 files changed, 158 insertions(+), 133 deletions(-) create mode 100644 src/host/dpu_host.c diff --git a/include/upmem.h b/include/upmem.h index 3464057..4b90e89 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -27,6 +27,10 @@ typedef struct { uint32_t cols_b; } dpu_args_t; +extern int upmem_initialized; + +void free_dpus(); +void init_dpus(); void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c new file mode 100644 index 0000000..e6a767b --- /dev/null +++ b/src/host/dpu_host.c @@ -0,0 +1,144 @@ +#include +#include +#include "upmem.h" + +struct dpu_set_t dpus, dpu; +int upmem_initialized = 0; + +void free_dpus() +{ + DPU_ASSERT(dpu_free(dpus)); +} + +void init_dpus() +{ + if(!upmem_initialized) { + assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); + + DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + + upmem_initialized = 1; + } +} + +void 
multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + double tileA[TILE_SIZE][TILE_SIZE]; + double tileB[TILE_SIZE][TILE_SIZE]; + double tileC[TILE_SIZE][TILE_SIZE]; + + for(int i=0; i= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + dpu_args_t args = { + .rows_a = dpu_rows_a_actual, + .cols_a = cols_a, + .cols_b = cols_b + }; + + DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); + + if(dpu_rows_a_actual) { + uint32_t elems_a = dpu_rows_a_actual * cols_a; + uint32_t bytes_a = elems_a * sizeof(double); + + double *A_chunk = (double*)malloc(bytes_a); + + for(int r=0; r= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + if(dpu_rows_a_actual) { + uint32_t elems_c = dpu_rows_a_actual * cols_b; + uint32_t bytes_c = elems_c * sizeof(double); + + double *C_chunk = (double*)malloc(bytes_c); + + DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); + + for(int r=0; r -#include -#include #include "mlp.h" #include "upmem.h" -struct dpu_set_t dpus, dpu; - void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { #ifdef UPMEM + init_dpus(); multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); #else multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); #endif } -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) -{ - assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); - - double tileA[TILE_SIZE][TILE_SIZE]; - double tileB[TILE_SIZE][TILE_SIZE]; - double tileC[TILE_SIZE][TILE_SIZE]; - - for(int i=0; i= rows_a) ? 0 - : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) - : dpu_rows_a_max; - - dpu_args_t args = { - .rows_a = dpu_rows_a_actual, - .cols_a = cols_a, - .cols_b = cols_b - }; - - DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); - - if(dpu_rows_a_actual) { - uint32_t elems_a = dpu_rows_a_actual * cols_a; - uint32_t bytes_a = elems_a * sizeof(double); - - double *A_chunk = (double*)malloc(bytes_a); - - for(int r=0; r= rows_a) ? 0 - : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) - : dpu_rows_a_max; - - if(dpu_rows_a_actual) { - uint32_t elems_c = dpu_rows_a_actual * cols_b; - uint32_t bytes_c = elems_c * sizeof(double); - - double *C_chunk = (double*)malloc(bytes_c); - - DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); - - for(int r=0; r Date: Sat, 22 Nov 2025 13:38:46 +0100 Subject: [PATCH 20/32] Parallelize DPU workload through tasklets --- Makefile | 2 +- include/upmem.h | 2 +- src/dpu/dpu_program.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6b1b21a..5c9f6da 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang -DPU_UPMEM_CFLAGS += +DPU_UPMEM_CFLAGS += -DNR_TASKLETS=4 CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ diff --git a/include/upmem.h b/include/upmem.h index 4b90e89..47a32d5 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -16,7 +16,7 @@ #endif #ifndef TILE_SIZE -#define TILE_SIZE 128 +#define TILE_SIZE 256 #endif #define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 193587e..fe3f470 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -18,7 +18,10 @@ int main() if(!rows_a) return 0; - for(int i=0; i Date: Sat, 22 Nov 2025 14:48:53 +0100 Subject: [PATCH 21/32] Simplify macro scheme --- include/upmem.h | 10 ++-------- src/dpu/dpu_program.c | 6 +++--- 
src/host/dpu_host.c | 6 ------ 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/include/upmem.h b/include/upmem.h index 47a32d5..24df7a8 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -3,24 +3,18 @@ #include -#define ROWS_A_MAX 320 -#define COLS_A_MAX 320 -#define COLS_B_MAX 320 - #ifndef DPU_BINARY_PATH #define DPU_BINARY_PATH "build/dpu_program" #endif #ifndef NUM_DPU -#define NUM_DPU 64 +#define NUM_DPU 8 #endif #ifndef TILE_SIZE -#define TILE_SIZE 256 +#define TILE_SIZE 32 #endif -#define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) - typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index fe3f470..32697e3 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -2,9 +2,9 @@ #include #include "upmem.h" -__mram_noinit double A_chunk[ROWS_A_PER_DPU_MAX * COLS_A_MAX]; -__mram_noinit double B_whole[COLS_A_MAX * COLS_B_MAX]; -__mram_noinit double C_chunk[ROWS_A_PER_DPU_MAX * COLS_B_MAX]; +__mram_noinit double A_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit double B_whole[TILE_SIZE * TILE_SIZE]; +__mram_noinit double C_chunk[TILE_SIZE * TILE_SIZE]; __host dpu_args_t DPU_INPUT_ARGS; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index e6a767b..e923e98 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -13,8 +13,6 @@ void free_dpus() void init_dpus() { if(!upmem_initialized) { - assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); - DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); @@ -70,10 +68,6 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { - assert(rows_a <= ROWS_A_MAX); - assert(cols_a <= COLS_A_MAX); - assert(cols_b <= COLS_B_MAX); - unsigned int bytes_b = cols_a * cols_b * sizeof(double); DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, 
DPU_XFER_DEFAULT)); From c1d1b3e30615dfb317a058478c89b837ecffb68d Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 16:19:10 +0100 Subject: [PATCH 22/32] Move from double to single-precision float --- include/mlp.h | 36 ++++++++++++------------- include/upmem.h | 4 +-- src/dpu/dpu_program.c | 8 +++--- src/host/accumulate_layer_gradients.c | 6 ++--- src/host/activation.c | 8 +++--- src/host/apply_gradients.c | 8 +++--- src/host/dpu_host.c | 24 ++++++++--------- src/host/drand.c | 4 +-- src/host/get_delta.c | 10 +++---- src/host/get_total_loss.c | 8 +++--- src/host/get_y.c | 6 ++--- src/host/get_z.c | 6 ++--- src/host/init_layer.c | 4 +-- src/host/init_neuron.c | 10 +++---- src/host/matrix.c | 8 +++--- src/host/mlp.c | 36 ++++++++++++------------- src/host/sse.c | 8 +++--- src/host/utils.c | 10 +++---- tests/test_accumulate_layer_gradients.c | 4 +-- tests/test_activation.c | 8 +++--- tests/test_drand.c | 2 +- tests/test_get_delta.c | 8 +++--- tests/test_get_y.c | 12 ++++----- tests/test_get_z.c | 10 +++---- tests/test_init_layer.c | 8 +++--- tests/test_init_network.c | 2 +- tests/test_init_neuron.c | 4 +-- tests/test_matrix.c | 28 +++++++++---------- tests/test_sse.c | 6 ++--- 29 files changed, 148 insertions(+), 148 deletions(-) diff --git a/include/mlp.h b/include/mlp.h index 9ef6cdf..081cea9 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -35,13 +35,13 @@ extern unsigned int rseed; typedef struct { int num_weights; - double *w, *lw; - double *batch_dw; + float *w, *lw; + float *batch_dw; } NEURON; typedef struct { int num_neurons; - double *inputs, *deltas; + float *inputs, *deltas; NEURON *n; } LAYER; @@ -50,23 +50,23 @@ typedef struct { LAYER *l; } NETWORK; -void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate); +void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate); void apply_gradients(NETWORK *n, int batch_size); -double drand(); -double get_activation(double x); -double 
get_activation_derivative(double x); -double *get_delta(NETWORK *n, double *samples, double *ideal, int layer_index); -double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples); -double *get_y(NETWORK *n, int layer_index, double *sample); -double *get_z(NETWORK *n, int layer_index, double *sample); +float drand(); +float get_activation(float x); +float get_activation_derivative(float x); +float *get_delta(NETWORK *n, float *samples, float *ideal, int layer_index); +float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples); +float *get_y(NETWORK *n, int layer_index, float *sample); +float *get_z(NETWORK *n, int layer_index, float *sample); LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size); NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size); NEURON *init_neuron(int num_weights); -void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); -void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols); -double sse(double *real, double *ideal, int length); -void transpose_matrix(const double *A, double *C, int rows, int cols); +float sse(float *real, float *ideal, int length); +void transpose_matrix(const float *A, float *C, int rows, int cols); // // utility functions @@ -76,10 +76,10 @@ void free_layer(LAYER *l); void free_network(NETWORK *n); void free_neuron(NEURON *n); -void free_double_matrix(double **addr, int nrows); +void free_float_matrix(float **addr, int nrows); void free_uint8_matrix(uint8_t **addr, int nrows); -void print_double_matrix(double **addr, int nrows, 
int ncols); -void print_double_vector(double *addr, int nrows); +void print_float_matrix(float **addr, int nrows, int ncols); +void print_float_vector(float *addr, int nrows); #endif diff --git a/include/upmem.h b/include/upmem.h index 24df7a8..726ee31 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -25,7 +25,7 @@ extern int upmem_initialized; void free_dpus(); void init_dpus(); -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); -void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); +void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); #endif diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 32697e3..1155bac 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -2,9 +2,9 @@ #include #include "upmem.h" -__mram_noinit double A_chunk[TILE_SIZE * TILE_SIZE]; -__mram_noinit double B_whole[TILE_SIZE * TILE_SIZE]; -__mram_noinit double C_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit float B_whole[TILE_SIZE * TILE_SIZE]; +__mram_noinit float C_chunk[TILE_SIZE * TILE_SIZE]; __host dpu_args_t DPU_INPUT_ARGS; @@ -23,7 +23,7 @@ int main() for(int i=row_start; i<(row_start+chunk); ++i) { for(int j=0; jnum_neurons; int num_weights = l->n->num_weights; - double *gradient = (double *) malloc (num_neurons * num_weights * sizeof(double)); + float *gradient = (float *) malloc (num_neurons * num_weights * sizeof(float)); if(!gradient) { return; } - double *deltas_T = (double*) malloc (num_neurons * batch_size * sizeof(double)); + float *deltas_T = (float*) malloc (num_neurons * batch_size * sizeof(float)); if(!deltas_T) { free(gradient); return; diff --git a/src/host/activation.c b/src/host/activation.c index eeaaee7..5345ec6 100644 --- 
a/src/host/activation.c +++ b/src/host/activation.c @@ -1,11 +1,11 @@ #include "mlp.h" -double get_activation(double x) +float get_activation(float x) { - return tanh(x); + return tanhf(x); } -double get_activation_derivative(double x) +float get_activation_derivative(float x) { - return 1.0 / pow(cosh(x), 2); + return 1.0 / powf(coshf(x), 2); } \ No newline at end of file diff --git a/src/host/apply_gradients.c b/src/host/apply_gradients.c index 4bc143b..ede95e7 100644 --- a/src/host/apply_gradients.c +++ b/src/host/apply_gradients.c @@ -15,11 +15,11 @@ void apply_gradients(NETWORK *n, int batch_size) for(int k=0; knum_weights; k++) // do the following for all weights "k" of said neuron: { - double previous_weight_update = np->w[k] - np->lw[k]; - double momentum_term = MOMENTUM * previous_weight_update; - double gradient_term = np->batch_dw[k] / (double) batch_size; + float previous_weight_update = np->w[k] - np->lw[k]; + float momentum_term = MOMENTUM * previous_weight_update; + float gradient_term = np->batch_dw[k] / (float) batch_size; - double old_weight = np->w[k]; + float old_weight = np->w[k]; np->lw[k] = old_weight; np->w[k] = old_weight + gradient_term + momentum_term; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index e923e98..ac203a0 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -20,11 +20,11 @@ void init_dpus() } } -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { - double tileA[TILE_SIZE][TILE_SIZE]; - double tileB[TILE_SIZE][TILE_SIZE]; - double tileC[TILE_SIZE][TILE_SIZE]; + float tileA[TILE_SIZE][TILE_SIZE]; + float tileB[TILE_SIZE][TILE_SIZE]; + float tileC[TILE_SIZE][TILE_SIZE]; for(int i=0; il+layer_index)->num_neurons; - double *d = (double*) malloc (sizeof(double) * layer_size); + float *d = (float*) malloc (sizeof(float) * layer_size); if(!d) { 
fprintf(stderr, "Error 10010\n"); return NULL; } - double *z = get_z(n, layer_index, sample); + float *z = get_z(n, layer_index, sample); if(!z) { fprintf(stderr, "Error 10011\n"); free(d); @@ -21,7 +21,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index) if(is_current_layer_last_layer) { - double *y = get_y(n, layer_index, sample); + float *y = get_y(n, layer_index, sample); if(!y) { fprintf(stderr, "Error 10012\n"); free(d); @@ -36,7 +36,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index) } else { - double *next_d = get_delta(n, sample, ideal, layer_index+1); + float *next_d = get_delta(n, sample, ideal, layer_index+1); if(!next_d) { fprintf(stderr, "Error 10013\n"); free(d); diff --git a/src/host/get_total_loss.c b/src/host/get_total_loss.c index 8bf7f2c..c386536 100644 --- a/src/host/get_total_loss.c +++ b/src/host/get_total_loss.c @@ -1,8 +1,8 @@ #include "mlp.h" -double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples) +float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples) { - double *total_loss = (double*) malloc (sizeof(double)); + float *total_loss = (float*) malloc (sizeof(float)); if(!total_loss) { fprintf(stderr, "Error 10007\n"); return NULL; @@ -13,13 +13,13 @@ double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsample LAYER *last_layer = n->l+(n->num_layers-1); for(int i=0; inum_layers-1, samples[i]); + float *y = get_y(n, n->num_layers-1, samples[i]); if(!y) { fprintf(stderr, "Error 10008\n"); free(total_loss); return NULL; } - *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (double)nsamples; + *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (float)nsamples; free(y); } diff --git a/src/host/get_y.c b/src/host/get_y.c index 3e5b70e..5931c62 100644 --- a/src/host/get_y.c +++ b/src/host/get_y.c @@ -2,7 +2,7 @@ // preactivation -> get_y -> activation -double *get_y(NETWORK *n, int layer_index, 
double *sample) +float *get_y(NETWORK *n, int layer_index, float *sample) { LAYER *current_layer = n->l+layer_index; int is_current_layer_last_layer = (n->num_layers == layer_index + 1); @@ -11,9 +11,9 @@ double *get_y(NETWORK *n, int layer_index, double *sample) if(!is_current_layer_last_layer) // add bias node y_size++; - double *z = get_z(n, layer_index, sample); + float *z = get_z(n, layer_index, sample); - double *y = (double *) malloc (sizeof(double)*y_size); + float *y = (float *) malloc (sizeof(float)*y_size); if(!y) { fprintf(stderr, "Error 10006\n"); return NULL; diff --git a/src/host/get_z.c b/src/host/get_z.c index ad7a08d..466ee1a 100644 --- a/src/host/get_z.c +++ b/src/host/get_z.c @@ -2,20 +2,20 @@ // samples -> get_z -> preactivation -double *get_z(NETWORK *n, int layer_index, double *sample) +float *get_z(NETWORK *n, int layer_index, float *sample) { LAYER *current_layer = n->l+layer_index; int z_neuroncount = current_layer->num_neurons; int z_weightcount = current_layer->n->num_weights; int is_first_layer = layer_index == 0; - double *z = (double *) malloc (sizeof(double)* z_neuroncount); + float *z = (float *) malloc (sizeof(float)* z_neuroncount); if(!z) { fprintf(stderr, "Error 10005\n"); return NULL; } - double *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample); + float *z_prev = is_first_layer ? 
sample : get_y(n, layer_index-1, sample); for(size_t i=0; inum_neurons = num_neurons; - l->inputs = (double*) malloc (batch_size * num_weights_per_neuron * sizeof(double)); + l->inputs = (float*) malloc (batch_size * num_weights_per_neuron * sizeof(float)); if(!l->inputs) { free(l); return NULL; } - l->deltas = (double*) malloc (batch_size * num_neurons * sizeof(double)); + l->deltas = (float*) malloc (batch_size * num_neurons * sizeof(float)); if(!l->deltas) { free(l->inputs); free(l); diff --git a/src/host/init_neuron.c b/src/host/init_neuron.c index b5506f3..450677e 100644 --- a/src/host/init_neuron.c +++ b/src/host/init_neuron.c @@ -9,20 +9,20 @@ NEURON *init_neuron(int num_weights) n->num_weights = num_weights; - n->w = (double *) malloc (sizeof(double) * n->num_weights); + n->w = (float *) malloc (sizeof(float) * n->num_weights); if(!n->w) { free(n); return NULL; } - n->lw = (double *) malloc (sizeof(double) * n->num_weights); + n->lw = (float *) malloc (sizeof(float) * n->num_weights); if(!n->lw) { free(n->w); free(n); return NULL; } - n->batch_dw = (double *) malloc (sizeof(double) * n->num_weights); + n->batch_dw = (float *) malloc (sizeof(float) * n->num_weights); if(!n->batch_dw) { free(n->lw); free(n->w); @@ -30,11 +30,11 @@ NEURON *init_neuron(int num_weights) return NULL; } - double limit = 1.0/sqrt((double) num_weights); + float limit = 1.0/sqrt((float) num_weights); for(int i=0; iw[i] = (rand_unit * 2.0 - 1.0) * limit; n->lw[i] = n->w[i]; n->batch_dw[i] = 0; diff --git a/src/host/matrix.c b/src/host/matrix.c index 017d014..967ba10 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -1,7 +1,7 @@ #include "mlp.h" #include "upmem.h" -void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { #ifdef UPMEM init_dpus(); @@ -11,11 +11,11 @@ void multiply_matrix(const double *A, const double *B, double *C, 
int rows_a, in #endif } -void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { for(int i=0; inum_layers-1; j>=0; --j) { LAYER *lp = n->l+j; // ptr to layer j of network n - double *d = get_delta(n, samples[i], labels[i], j); + float *d = get_delta(n, samples[i], labels[i], j); - memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(double)); + memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(float)); - double *py = j ? get_y(n, j-1, samples[i]) : NULL; + float *py = j ? get_y(n, j-1, samples[i]) : NULL; if(j && !py) { fprintf(stderr, "Error 10009\n"); return 1; } - memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(double)); + memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(float)); free(d); if(j) free(py); @@ -106,18 +106,18 @@ int main() apply_gradients(n, actual_batch_size); } - double *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES); + float *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES); if(!loss_new) { fprintf(stderr, "Error 10015\n"); return 1; } - double loss_delta = fabs(*loss_new - *loss_prev); + float loss_delta = fabs(*loss_new - *loss_prev); epoch++; #ifdef VERBOSE - printf("Epoch %-3d --- Lost Delta = %.9lf --- Final Loss = %.6lf\n", epoch, loss_delta, *loss_new); + printf("Epoch %-3d --- Lost Delta = %.9f --- Final Loss = %.6f\n", epoch, loss_delta, *loss_new); #endif free(loss_prev); @@ -135,7 +135,7 @@ int main() LAYER *lp = n->l+i; // ptr to i-th layer of the network n for(int j=0; jnum_neurons; j++) { NEURON *np = lp->n+j; // ptr to j-th neuron of the i-th layer of network n - print_double_vector(np->w, np->num_weights); + print_float_vector(np->w, np->num_weights); printf("\n"); } printf("\n\n"); @@ -148,8 
+148,8 @@ int main() } // memory cleanup before termination - free_double_matrix(samples, NUM_TRAIN_SAMPLES); - free_double_matrix(labels, NUM_TRAIN_SAMPLES); + free_float_matrix(samples, NUM_TRAIN_SAMPLES); + free_float_matrix(labels, NUM_TRAIN_SAMPLES); free_network(n); return 0; diff --git a/src/host/sse.c b/src/host/sse.c index ab3477b..cf58db9 100644 --- a/src/host/sse.c +++ b/src/host/sse.c @@ -1,12 +1,12 @@ #include "mlp.h" -double sse(double *real, double *ideal, int length) +float sse(float *real, float *ideal, int length) { - double sse = 0.0; // Sum of squared errors + float sse = 0.0; // Sum of squared errors for(size_t i=0; inum_weights = 0; } -void free_double_matrix(double **addr, int nrows) +void free_float_matrix(float **addr, int nrows) { if(!addr) return; @@ -68,19 +68,19 @@ void free_uint8_matrix(uint8_t **addr, int nrows) free(addr); } -void print_double_matrix(double **addr, int nrows, int ncols) +void print_float_matrix(float **addr, int nrows, int ncols) { for(size_t i=0; iinputs[i] = ((double) rand() / (double) RAND_MAX) * 20; + first_layer->inputs[i] = ((float) rand() / (float) RAND_MAX) * 20; // deltas is a 1x4 identity matrix for(int i=0; i<1*4; i++) first_layer->deltas[i] = 1.0; - double batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + float batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; for(int i=0; i<4; i++) for(int j=0; j<5; j++) diff --git a/tests/test_activation.c b/tests/test_activation.c index 8998218..2d120bb 100644 --- a/tests/test_activation.c +++ b/tests/test_activation.c @@ -1,12 +1,12 @@ #include "mlp.h" #include "test.h" -int test_activation(double x) +int test_activation(float x) { - double activation_result = get_activation(x); - double activation_derivative_result = get_activation_derivative(x); + float activation_result = get_activation(x); + float activation_derivative_result = get_activation_derivative(x); - double 
expected_activation_derivative = 1 - pow(activation_result, 2); + float expected_activation_derivative = 1 - powf(activation_result, 2); if(abs(activation_derivative_result - expected_activation_derivative) < 1e-5) return 1; diff --git a/tests/test_drand.c b/tests/test_drand.c index 1411e08..2771bb1 100644 --- a/tests/test_drand.c +++ b/tests/test_drand.c @@ -7,7 +7,7 @@ int test_drand() for(int i=0; i<10; i++) { - double test_value = drand(); + float test_value = drand(); test_pass_fail &= (test_value >= 0.0) && (test_value <= 1.0); } diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c index b8a06c6..068f347 100644 --- a/tests/test_get_delta.c +++ b/tests/test_get_delta.c @@ -6,8 +6,8 @@ int test_get_delta() int test_pass_fail = 1; int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; - double ideals[] = {3, 3, 3, 3}; + float samples[] = {1, 1, 1, 1}; + float ideals[] = {3, 3, 3, 3}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -28,7 +28,7 @@ int test_get_delta() // test last layer delta - double *d_last_layer = get_delta(n, samples, ideals, 1); + float *d_last_layer = get_delta(n, samples, ideals, 1); for(int i=0; i<3; i++) { @@ -37,7 +37,7 @@ int test_get_delta() // test before-last layer delta - double *d_first_layer = get_delta(n, samples, ideals, 0); + float *d_first_layer = get_delta(n, samples, ideals, 0); for(int i=0; i<3; i++) { diff --git a/tests/test_get_y.c b/tests/test_get_y.c index 4274682..de7cb02 100644 --- a/tests/test_get_y.c +++ b/tests/test_get_y.c @@ -4,7 +4,7 @@ int test_get_y() { int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; + float samples[] = {1, 1, 1, 1}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -23,12 +23,12 @@ int test_get_y() n->l[0].n[2].w[2] = 0.0; n->l[0].n[2].w[3] = 0.0; - double *y = get_y(n, 0, samples); - double *z = get_z(n, 0, samples); + float *y = get_y(n, 0, samples); + float *z = get_z(n, 0, samples); - // 
printf("y[0] == %.2lf\n", y[0]); - // printf("y[1] == %.2lf\n", y[1]); - // printf("y[2] == %.2lf\n", y[2]); + // printf("y[0] == %.2f\n", y[0]); + // printf("y[1] == %.2f\n", y[1]); + // printf("y[2] == %.2f\n", y[2]); int test_pass_fail = (y[0] == 1) && (y[1] == get_activation(z[0])) diff --git a/tests/test_get_z.c b/tests/test_get_z.c index be921c6..7244651 100644 --- a/tests/test_get_z.c +++ b/tests/test_get_z.c @@ -4,7 +4,7 @@ int test_get_z() { int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; + float samples[] = {1, 1, 1, 1}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -23,11 +23,11 @@ int test_get_z() n->l[0].n[2].w[2] = 0.0; n->l[0].n[2].w[3] = 0.0; - double *z = get_z(n, 0, samples); + float *z = get_z(n, 0, samples); - // printf("z[0] == %.2lf\n", z[0]); - // printf("z[1] == %.2lf\n", z[1]); - // printf("z[2] == %.2lf\n", z[2]); + // printf("z[0] == %.2f\n", z[0]); + // printf("z[1] == %.2f\n", z[1]); + // printf("z[2] == %.2f\n", z[2]); int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1); diff --git a/tests/test_init_layer.c b/tests/test_init_layer.c index 13b7ae6..761f331 100644 --- a/tests/test_init_layer.c +++ b/tests/test_init_layer.c @@ -6,10 +6,10 @@ int test_init_layer() LAYER *l = init_layer(3, 4, BATCH_SIZE); // printf("%d\n", l->num_neurons); - // printf("%lf\n", l->n[0].w[0]); - // printf("%lf\n", l->n[1].w[0]); - // printf("%lf\n", l->n[2].w[0]); - // printf("%lf\n", l->n[0].lw[0]); + // printf("%f\n", l->n[0].w[0]); + // printf("%f\n", l->n[1].w[0]); + // printf("%f\n", l->n[2].w[0]); + // printf("%f\n", l->n[0].lw[0]); // printf("%d\n", l->n[0].num_weights); // printf("%d\n", l->n[1].num_weights); // printf("%d\n", l->n[2].num_weights); diff --git a/tests/test_init_network.c b/tests/test_init_network.c index 375565f..2e5603b 100644 --- a/tests/test_init_network.c +++ b/tests/test_init_network.c @@ -12,7 +12,7 @@ int test_init_network() // printf("%d\n", n->l[1].num_neurons); 
// printf("%d\n", n->l[2].num_neurons); // printf("%d\n", n->l[0].n[0].num_weights); - // printf("%lf\n", n->l[0].n[0].lw[0]); + // printf("%f\n", n->l[0].n[0].lw[0]); // printf("%d\n", n->l[1].n[0].num_weights); // printf("%d\n", n->l[2].n[0].num_weights); diff --git a/tests/test_init_neuron.c b/tests/test_init_neuron.c index 61d0232..486548c 100644 --- a/tests/test_init_neuron.c +++ b/tests/test_init_neuron.c @@ -6,8 +6,8 @@ int test_init_neuron() NEURON *n = init_neuron(2); // printf("%d\n", n->num_weights); - // printf("%lf\n", n->w[0]); - // printf("%lf\n", n->lw[0]); + // printf("%f\n", n->w[0]); + // printf("%f\n", n->lw[0]); return (n->num_weights == 2) && (n->w[0] <= 1) && (n->w[0] >= -1) && (n->lw[0] == n->w[0]); } diff --git a/tests/test_matrix.c b/tests/test_matrix.c index 699646e..37e9308 100644 --- a/tests/test_matrix.c +++ b/tests/test_matrix.c @@ -6,23 +6,23 @@ int test_multiply_matrix() { int test_result_pass_fail = 1; - double matrixA[2*3] = {1.0, 2.0, 3.0, + float matrixA[2*3] = {1.0, 2.0, 3.0, 0.0, 5.0, 6.0}; - double matrixB[3*2] = {2.0, 6.0, + float matrixB[3*2] = {2.0, 6.0, 3.0, 3.0, 4.0, 0.0}; - // result matrices (initialized with random double values [0.0, 20.0]) - double matrixC[2*2]; - double matrixD[2*2]; + // result matrices (initialized with random float values [0.0, 20.0]) + float matrixC[2*2]; + float matrixD[2*2]; for(int i=0; i<2*2; i++) { - matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20; - matrixD[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixC[i] = ((float)rand() / (float)RAND_MAX) * 20; + matrixD[i] = ((float)rand() / (float)RAND_MAX) * 20; } // ideal result - double matrixR[2*2] = {20.0, 12.0, + float matrixR[2*2] = {20.0, 12.0, 39.0, 15.0}; multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2); @@ -32,7 +32,7 @@ int test_multiply_matrix() free_dpus(); for(int i=0; i<2*2; i++) { - printf("%lf ", matrixC[i]); + printf("%f ", matrixC[i]); } for(int i=0; i<2*2; i++) { @@ -47,17 +47,17 @@ int 
test_transpose_matrix() { int test_result_pass_fail = 1; - double matrixA[2*3] = {1.0, 2.0, 3.0, + float matrixA[2*3] = {1.0, 2.0, 3.0, 0.0, 5.0, 6.0}; - // result matrix (initialized with random double values [0.0, 20.0]) - double matrixT[3*2]; + // result matrix (initialized with random float values [0.0, 20.0]) + float matrixT[3*2]; for(int i=0; i<3*2; i++) { - matrixT[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixT[i] = ((float)rand() / (float)RAND_MAX) * 20; } // ideal result - double matrixR[3*2] = {1.0, 0.0, + float matrixR[3*2] = {1.0, 0.0, 2.0, 5.0, 3.0, 6.0}; diff --git a/tests/test_sse.c b/tests/test_sse.c index 8f660c0..9cacee5 100644 --- a/tests/test_sse.c +++ b/tests/test_sse.c @@ -3,11 +3,11 @@ int test_sse() { - double real[] = {3, 4, 4, 4}; - double ideal[] = {4, 4, 4, 4}; + float real[] = {3, 4, 4, 4}; + float ideal[] = {4, 4, 4, 4}; int test_pass_fail = 1; - double sse_result = sse(real, ideal, 4); + float sse_result = sse(real, ideal, 4); test_pass_fail = test_pass_fail && (sse_result == 1); From 207ada9044692b122e0461bb8c1707adc30937cc Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 16:30:29 +0100 Subject: [PATCH 23/32] Introduce macro `TEST_FLOAT_EQ` & adapt unit tests --- include/test.h | 4 ++++ tests/test_accumulate_layer_gradients.c | 2 +- tests/test_get_delta.c | 8 ++++++-- tests/test_get_y.c | 12 ++++++------ tests/test_get_z.c | 2 +- tests/test_sse.c | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/test.h b/include/test.h index 5cf9797..880f862 100644 --- a/include/test.h +++ b/include/test.h @@ -11,4 +11,8 @@ if(test_result == 0) \ printf("PASS\n"); \ return 0; \ +#define TEST_FLOAT_EQ(v1, v2, eps) (fabsf((v1) - (v2)) < (eps)) + +#define EPS_TEST 1e-5 + #endif \ No newline at end of file diff --git a/tests/test_accumulate_layer_gradients.c b/tests/test_accumulate_layer_gradients.c index 77cfc4e..87b7a92 100644 --- a/tests/test_accumulate_layer_gradients.c +++ 
b/tests/test_accumulate_layer_gradients.c @@ -29,7 +29,7 @@ int test_accumulate_layer_gradients() for(int i=0; i<4; i++) for(int j=0; j<5; j++) - test_pass_fail &= batch_dw_ideal[i][j] == first_layer->n[i].batch_dw[j]; + test_pass_fail &= TEST_FLOAT_EQ(batch_dw_ideal[i][j], first_layer->n[i].batch_dw[j], EPS_TEST); return test_pass_fail; } diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c index 068f347..97e63c1 100644 --- a/tests/test_get_delta.c +++ b/tests/test_get_delta.c @@ -32,7 +32,9 @@ int test_get_delta() for(int i=0; i<3; i++) { - test_pass_fail &= (d_last_layer[i] == (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i])); + test_pass_fail &= TEST_FLOAT_EQ(d_last_layer[i], + (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]), + EPS_TEST); } // test before-last layer delta @@ -41,7 +43,9 @@ int test_get_delta() for(int i=0; i<3; i++) { - test_pass_fail &= (d_first_layer[i] == (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i])); + test_pass_fail &= TEST_FLOAT_EQ(d_first_layer[i], + (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]), + EPS_TEST); } return test_pass_fail; diff --git a/tests/test_get_y.c b/tests/test_get_y.c index de7cb02..30206bd 100644 --- a/tests/test_get_y.c +++ b/tests/test_get_y.c @@ -30,17 +30,17 @@ int test_get_y() // printf("y[1] == %.2f\n", y[1]); // printf("y[2] == %.2f\n", y[2]); - int test_pass_fail = (y[0] == 1) - && (y[1] == get_activation(z[0])) - && (y[2] == get_activation(z[1])); + int test_pass_fail = TEST_FLOAT_EQ(y[0], 1, EPS_TEST) + && TEST_FLOAT_EQ(y[1], get_activation(z[0]), EPS_TEST) + && TEST_FLOAT_EQ(y[2], get_activation(z[1]), EPS_TEST); y = get_y(n, 1, samples); z = get_z(n, 1, samples); test_pass_fail = test_pass_fail - && (y[0] == get_activation(z[0])) - && (y[1] == get_activation(z[1])) - && (y[2] == 
get_activation(z[2])); + && TEST_FLOAT_EQ(y[0], get_activation(z[0]), EPS_TEST) + && TEST_FLOAT_EQ(y[1], get_activation(z[1]), EPS_TEST) + && TEST_FLOAT_EQ(y[2], get_activation(z[2]), EPS_TEST); return test_pass_fail; } diff --git a/tests/test_get_z.c b/tests/test_get_z.c index 7244651..4367604 100644 --- a/tests/test_get_z.c +++ b/tests/test_get_z.c @@ -29,7 +29,7 @@ int test_get_z() // printf("z[1] == %.2f\n", z[1]); // printf("z[2] == %.2f\n", z[2]); - int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1); + int test_pass_fail = TEST_FLOAT_EQ(z[0], 2, EPS_TEST) && TEST_FLOAT_EQ(z[1], 6, EPS_TEST) && TEST_FLOAT_EQ(z[2], -1, EPS_TEST); return test_pass_fail; } diff --git a/tests/test_sse.c b/tests/test_sse.c index 9cacee5..732258a 100644 --- a/tests/test_sse.c +++ b/tests/test_sse.c @@ -9,20 +9,20 @@ int test_sse() float sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 1); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 1, EPS_TEST); real[0] = 4; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 0); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 0, EPS_TEST); real[0] = 6; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 4); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 4, EPS_TEST); real[0] = 6; real[1] = 2; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 8); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 8, EPS_TEST); return test_pass_fail; } From a7277ba384881ab9c10df260fd24bd711b3a2f59 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 11:53:53 +0100 Subject: [PATCH 24/32] Update `TILE_SIZE` and `NUM_DPU` --- Makefile | 2 +- include/upmem.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5c9f6da..b9e2626 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang -DPU_UPMEM_CFLAGS += -DNR_TASKLETS=4 
+DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16 CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ diff --git a/include/upmem.h b/include/upmem.h index 726ee31..c1fd7ee 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -8,11 +8,11 @@ #endif #ifndef NUM_DPU -#define NUM_DPU 8 +#define NUM_DPU 32 #endif #ifndef TILE_SIZE -#define TILE_SIZE 32 +#define TILE_SIZE 512 #endif typedef struct { From 6ddb943c2c934fb1a1d146171405cb01f5610fda Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 11:54:22 +0100 Subject: [PATCH 25/32] Fix typo in CMake configuration file --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26c5fc9..7f33b08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ add_custom_target(build_dpu_program ALL ) add_compile_definitions( - # NUM_CPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between + # NUM_DPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between # dpu_program.c and the rest. So this file should avoid modifying dimensions set through macros in aforementioned header files. 
DPU_BINARY_PATH=\"./dpu_program\" ) From e2cb79322c959fb1226c16a6140ea2cfafade619 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 12:01:17 +0100 Subject: [PATCH 26/32] Implement various improvements in top Makefile --- Makefile | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index b9e2626..5ee454a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,14 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16 -CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -FILES_TO_DELETE = build/ + +BATCH_SIZE ?= 20 +MAX_EPOCH ?= 10 +NUM_TRAIN_SAMPLES ?= 200 + +CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG +CFLAGS += -DBATCH_SIZE=$(BATCH_SIZE) -DMAX_EPOCH=$(MAX_EPOCH) -DNUM_TRAIN_SAMPLES=$(NUM_TRAIN_SAMPLES) + +BUILD_DIR = build/ UPMEM ?= 1 ifeq ($(UPMEM), 1) @@ -13,10 +20,10 @@ ifeq ($(SAN), 1) CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g endif -all: - mkdir build; \ +all: clean + mkdir $(BUILD_DIR); \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: - rm -rf $(FILES_TO_DELETE) + rm -rf $(BUILD_DIR) From 4035e75fbc42a0a582479fe4570cec0120460ba2 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 12:10:34 +0100 Subject: [PATCH 27/32] Add performance evaluation mode `EVAL` --- Makefile | 5 +++++ include/upmem.h | 2 ++ src/host/dpu_host.c | 8 +++++++- src/host/mlp.c | 10 +++++++++- 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5ee454a..e2f8548 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,11 @@ ifeq ($(SAN), 1) CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g endif +EVAL ?= 0 +ifeq ($(EVAL), 1) + CFLAGS += -DEVAL +endif + all: clean mkdir $(BUILD_DIR); \ 
$(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ diff --git a/include/upmem.h b/include/upmem.h index c1fd7ee..78bf601 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -15,6 +15,8 @@ #define TILE_SIZE 512 #endif +#define EVAL_DPU_CC 458000000 + typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index ac203a0..c09115f 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -1,5 +1,6 @@ #include #include +#include #include "upmem.h" struct dpu_set_t dpus, dpu; @@ -51,8 +52,13 @@ void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, } } } - + +#ifdef EVAL + unsigned long long start = __rdtsc(); + while(__rdtsc() - start < EVAL_DPU_CC); +#else process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); +#endif for(int row=0; row unsigned int rseed = 42; @@ -60,6 +61,10 @@ int main() int num_batches = (NUM_TRAIN_SAMPLES + BATCH_SIZE - 1) / BATCH_SIZE; +#ifdef EVAL + unsigned long long cc_start = __rdtsc(); +#endif + while(1) { float learning_rate_epoch = LEARNING_RATE * powf(DECAY_RATE, epoch); @@ -127,7 +132,10 @@ int main() break; } - printf("Training complete in %d epochs\n", epoch); +#ifdef EVAL + unsigned long long cc_end = __rdtsc(); + printf("Training complete | %lld cycles | %d epochs\n", cc_end-cc_start, epoch); +#endif #ifdef DEBUG printf("\n===== Weights =====\n\n"); From a098509c497bc4f0ac3b6249c8a0ec8fc82d2949 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 13:35:41 +0100 Subject: [PATCH 28/32] Add printout at program start for debugging --- src/host/mlp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/host/mlp.c b/src/host/mlp.c index b492cd8..de56b3b 100644 --- a/src/host/mlp.c +++ b/src/host/mlp.c @@ -46,6 +46,12 @@ int main() free_uint8_matrix(sample_data, sample_rows); free_uint8_matrix(label_data, label_rows); +#ifdef UPMEM + printf("Run in UPMEM 
mode with BATCH_SIZE=%d, NUM_TRAIN_SAMPLES=%d, MAX_EPOCH=%d\n\n", BATCH_SIZE, NUM_TRAIN_SAMPLES, MAX_EPOCH); +#else + printf("Run in HOST mode with BATCH_SIZE=%d, NUM_TRAIN_SAMPLES=%d, MAX_EPOCH=%d\n\n", BATCH_SIZE, NUM_TRAIN_SAMPLES, MAX_EPOCH); +#endif + #ifdef DEBUG // print samples & labels to check if all is saved correctly into program memory printf("===== Samples =====\n\n"); From efd75f3f9825675a801bf01fbd5c33b1f61e8ad0 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 19:00:22 +0100 Subject: [PATCH 29/32] Update README --- README.md | 93 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 7deb945..ae97435 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,108 @@ # UPMEM-MLP -UPMEM-MLP is an attempt at implementing a multilayer perceptron application in pure C and accelerating this application on the UPMEM platform. +UPMEM-MLP implements a multilayer perceptron training application in C and accelerates this application on the UPMEM platform. 
-[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Valgrind](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml) +[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Memory Leak Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml) -## Requirements +## Prerequisites -- GCC or Clang - CMake 3.10 or higher +- GCC +- Python - UPMEM SDK -### Installing UPMEM SDK +
+<summary>Installing UPMEM SDK</summary>
-To set up the UPMEM SDK on your system: +1. Download UPMEM SDK tarball for your system from [this link](https://github.com/kagandikmen/upmem-sdk) -1. Download UPMEM SDK tarball for your system from [this link](https://sdk.upmem.com/) +> **NOTICE:** UPMEM SDK is no longer downloadable on UPMEM's official SDK [Downloads](https://sdk.upmem.com) page. 2. Extract its content and (preferably) move it to a better place like `/usr/local/bin/` -3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc` as in: +3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc`: ```bash -source /usr/local/bin/upmem-sdk/upmem_env.sh > /dev/null +source /usr/local/bin/upmem-sdk/upmem_env.sh simulator > /dev/null ``` 4. Restart your shell session for the changes to become effective -5. Test your setup using: +5. Test your setup: ```bash which dpu-lldb ``` +--- +
-which should, if correctly installed, return the path to the LLDB Debugger binary of UPMEM SDK +## Getting Started -## Running the Unit Tests - -To run the CMake test flow: +1. Clone this repository and navigate inside it: ```bash -mkdir build -cd build -cmake .. -make -make test +git clone https://github.com/OpenHardware-Initiative/UPMEM-MLP.git +cd UPMEM-MLP ``` -## Compiling the Multilayer Perceptron Natively - -To natively run the C multilayer perceptron on your system: - -1. Create a Python virtual environment (optional, but recommended) and install requirements: +2. **(Optional, but recommended)** Create a Python virtual environment: ```bash python3 -m venv venv source venv/bin/activate +``` + +3. Install Python requirements: + +```bash pip install -r requirements.txt ``` -2. Extract training samples & labels: +4. Extract training samples & labels: ```bash python3 read_dataset.py ``` -3. Compile the application: +5. Compile the MLP: ```bash make ``` +6. Run the MLP: + +```bash +./build/mlp +``` + With this command, you can use: -- `-DVERBOSE` for the verbose mode, which prints loss deltas for all epochs -- `-DDEBUG` for the debug mode, which prints a couple samples & labels at the beginning and all weights at the end -- `-DBATCH_SIZE=...` to configure the batch size used during training -- `-DMAX_EPOCH=...` to configure the maximum number of epochs the training can run for -- `-DEPSILON=...` to configure epsilon from the command line -- `-DLEARNING_RATE=...` to configure learning rate from the command line -- `-DDECAY_RATE=...` to configure the decay rate of the learning rate -- `-DMOMENTUM=...` to configure momentum from the command line -- `-DNUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with -- `-DTRAINING_SAMPLES_FILE=...` to configure the path to the text file samples should be sourced from -- `-DTRAINING_LABELS_FILE=...` to configure the path to the text file labels should be sourced from +- 
`BATCH_SIZE=...` to configure the batch size used during training, which otherwise defaults to 20 +- `MAX_EPOCH=...` to configure the maximum number of epochs the training can run for, which otherwise defaults to 10 +- `NUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with, which otherwise defaults to 200 +- `UPMEM=0` to turn off matrix multiplication on UPMEM +- `SAN=1` to run the MLP with GCC sanitizer +- `EVAL=1` to run the MLP in evaluation mode, which adds to the printout how many cycles are spent in training -## Status +## Running the Unit Tests + +UPMEM-MLP comes with unit tests, which can be found in `tests/`. Run these unit tests using: -UPMEM-MLP is a work in progress as of 2025-11-21. +```bash +mkdir build +cd build +cmake .. +make +make test +``` -### To-Do +## Status -- [ ] Evaluate and document acceleration achieved by matrix multiplication on UPMEM DIMM +UPMEM-MLP is completed and being actively maintained as of 2025-11-23. ## License UPMEM-MLP is licensed under the Apache License v2.0. See [LICENSE](LICENSE) for more details. 
---- \ No newline at end of file +--- From 6b5c2649188f3935e2cbb3bc35254af4c4f8bff4 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:00:44 +0100 Subject: [PATCH 30/32] Add some logging to `src/dpu/dpu_program.c` --- src/dpu/dpu_program.c | 11 +++++++++++ src/host/dpu_host.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 1155bac..cea4413 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include "upmem.h" __mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE]; @@ -10,6 +12,8 @@ __host dpu_args_t DPU_INPUT_ARGS; int main() { + perfcounter_config(COUNT_CYCLES, false); + dpu_args_t dpu_input_args = DPU_INPUT_ARGS; uint32_t rows_a = dpu_input_args.rows_a; uint32_t cols_a = dpu_input_args.cols_a; @@ -18,6 +22,8 @@ int main() if(!rows_a) return 0; + perfcounter_t cc_start = perfcounter_get(); + int chunk = rows_a / NR_TASKLETS; int row_start = chunk * me(); @@ -30,6 +36,11 @@ int main() C_chunk[i * cols_b + j] = sum; } } + + perfcounter_t cc_end = perfcounter_get(); + + if(me() == 0) + printf("DPU completed in %ld cycles\n", cc_end-cc_start); return 0; } \ No newline at end of file diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index c09115f..17bc719 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "upmem.h" @@ -115,6 +116,10 @@ void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, in DPU_ASSERT(dpu_launch(dpus, DPU_SYNCHRONOUS)); + DPU_FOREACH(dpus, dpu) { + DPU_ASSERT(dpu_log_read(dpu, stdout)); + } + dpu_idx = 0; DPU_FOREACH(dpus, dpu) { From 5f3fb0ab959e89de53c6c6193db05cefd08f883f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:01:41 +0100 Subject: [PATCH 31/32] Register benchmarking results in `benchmarks.md` --- benchmarks.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 
benchmarks.md diff --git a/benchmarks.md b/benchmarks.md new file mode 100644 index 0000000..ce5ca11 --- /dev/null +++ b/benchmarks.md @@ -0,0 +1,10 @@ +# Benchmark Results + +## NN Layout: NUM_FEATURES -> 4096 -> 4096 -> 2048 -> NUM_LABELS + +| BATCH_SIZE | NUM_TRAIN_SAMPLES | MAX_EPOCH | Cycles (Intel 64 Host) | Cycles (Intel 64 Host + UPMEM) | +|------------|-------------------|-----------|------------------------|--------------------------------| +| 1200 | 3600 | 1 | 13.05T | 12.73T | +| 3600 | 10800 | 1 | 42.38T | 39.49T | + +--- From c9d9af1b21c03f79c5b868d2bbcc66ce0328f151 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:03:02 +0100 Subject: [PATCH 32/32] Update CI for recent changes in upmem-sdk repo --- .github/build_upmem_toolchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh index cf7b9ab..cd157ec 100644 --- a/.github/build_upmem_toolchain.sh +++ b/.github/build_upmem_toolchain.sh @@ -2,6 +2,6 @@ cd /opt/ git clone https://github.com/kagandikmen/upmem-sdk.git -tar -xvf upmem-sdk/upmem-2024.2.0-Linux-x86_64.tar.gz +tar -xvf upmem-sdk/2024.2.0/upmem-2024.2.0-Linux-x86_64.tar.gz mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/ rm -rf upmem-sdk/ \ No newline at end of file