From c58048a156350947ff45f7a2e8a5dddc07176513 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Thu, 20 Nov 2025 19:32:37 +0100 Subject: [PATCH 01/32] Introduce matrix multiplication wrapper function --- include/mlp.h | 1 + src/matrix.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/mlp.h b/include/mlp.h index b2a6616..9ef6cdf 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -63,6 +63,7 @@ LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size); NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size); NEURON *init_neuron(int num_weights); void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols); double sse(double *real, double *ideal, int length); void transpose_matrix(const double *A, double *C, int rows, int cols); diff --git a/src/matrix.c b/src/matrix.c index 6b40dc2..c070de8 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -1,6 +1,11 @@ #include "mlp.h" void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +} + +void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { for(int i=0; i Date: Thu, 20 Nov 2025 19:50:36 +0100 Subject: [PATCH 02/32] Create `legacy/` & move old UPMEM files inside --- Makefile => legacy/Makefile | 0 generate.py => legacy/generate.py | 0 host.c => legacy/host.c | 0 matmul.template => legacy/matmul.template | 0 matrices.template => legacy/matrices.template | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename Makefile => legacy/Makefile (100%) rename generate.py => legacy/generate.py (100%) rename host.c => legacy/host.c (100%) rename matmul.template => 
legacy/matmul.template (100%) rename matrices.template => legacy/matrices.template (100%) diff --git a/Makefile b/legacy/Makefile similarity index 100% rename from Makefile rename to legacy/Makefile diff --git a/generate.py b/legacy/generate.py similarity index 100% rename from generate.py rename to legacy/generate.py diff --git a/host.c b/legacy/host.c similarity index 100% rename from host.c rename to legacy/host.c diff --git a/matmul.template b/legacy/matmul.template similarity index 100% rename from matmul.template rename to legacy/matmul.template diff --git a/matrices.template b/legacy/matrices.template similarity index 100% rename from matrices.template rename to legacy/matrices.template From ae403fab97f07eb66247caeaff268421c341b527 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Thu, 20 Nov 2025 21:25:39 +0100 Subject: [PATCH 03/32] Implement tiled matrix multiplication --- include/mlp.h | 2 ++ src/matrix.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/include/mlp.h b/include/mlp.h index 9ef6cdf..4eef999 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -31,6 +31,8 @@ #define MOMENTUM 0.8 #endif +#define TILE_SIZE 16 + extern unsigned int rseed; typedef struct { diff --git a/src/matrix.c b/src/matrix.c index c070de8..6355cee 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -2,7 +2,48 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { - multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); + double tileA[TILE_SIZE][TILE_SIZE]; + double tileB[TILE_SIZE][TILE_SIZE]; + double tileC[TILE_SIZE][TILE_SIZE]; + + for(int i=0; i Date: Fri, 21 Nov 2025 08:30:14 +0100 Subject: [PATCH 04/32] Implement UPMEM-distributed matrix multiplication --- dpu_program.c | 32 +++++++++++++++++++ include/upmem.h | 21 +++++++++++++ src/matrix.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 134 insertions(+), 1 deletion(-) create mode 
100644 dpu_program.c create mode 100644 include/upmem.h diff --git a/dpu_program.c b/dpu_program.c new file mode 100644 index 0000000..193587e --- /dev/null +++ b/dpu_program.c @@ -0,0 +1,32 @@ +#include +#include +#include "upmem.h" + +__mram_noinit double A_chunk[ROWS_A_PER_DPU_MAX * COLS_A_MAX]; +__mram_noinit double B_whole[COLS_A_MAX * COLS_B_MAX]; +__mram_noinit double C_chunk[ROWS_A_PER_DPU_MAX * COLS_B_MAX]; + +__host dpu_args_t DPU_INPUT_ARGS; + +int main() +{ + dpu_args_t dpu_input_args = DPU_INPUT_ARGS; + uint32_t rows_a = dpu_input_args.rows_a; + uint32_t cols_a = dpu_input_args.cols_a; + uint32_t cols_b = dpu_input_args.cols_b; + + if(!rows_a) + return 0; + + for(int i=0; i + +#define ROWS_A_MAX 96 +#define COLS_A_MAX 96 +#define COLS_B_MAX 96 +#define ROWS_A_PER_DPU_MAX 16 + +#define NUM_DPU 16 + +typedef struct { + uint32_t rows_a; + uint32_t cols_a; + uint32_t cols_b; +} dpu_args_t; + +void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); + +#endif diff --git a/src/matrix.c b/src/matrix.c index 6355cee..dbc6e9a 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -1,4 +1,7 @@ +#include +#include #include "mlp.h" +#include "upmem.h" void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { @@ -32,7 +35,8 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } - multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + // multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + multiply_matrix_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); for(int row=0; row= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) + : dpu_rows_a_max; + + dpu_args_t args = { + .rows_a = dpu_rows_a_actual, + .cols_a = cols_a, + .cols_b = cols_b + }; + + DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); + + if(dpu_rows_a_actual) { + uint32_t elems_a = dpu_rows_a_actual * cols_a; + uint32_t bytes_a = elems_a * sizeof(double); + + double *A_chunk = (double*)malloc(bytes_a); + + for(int r=0; r= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + if(dpu_rows_a_actual) { + uint32_t elems_c = dpu_rows_a_actual * cols_b; + uint32_t bytes_c = elems_c * sizeof(double); + + double *C_chunk = (double*)malloc(bytes_c); + + DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); + + for(int r=0; r Date: Fri, 21 Nov 2025 08:39:01 +0100 Subject: [PATCH 05/32] Improve file organization --- .gitignore | 1 - include/upmem.h | 2 ++ dpu_program.c => src/dpu/dpu_program.c | 0 src/{ => host}/accumulate_layer_gradients.c | 0 src/{ => host}/activation.c | 0 src/{ => host}/apply_gradients.c | 0 src/{ => host}/drand.c | 0 src/{ => host}/get_delta.c | 0 src/{ => host}/get_total_loss.c | 0 src/{ => host}/get_y.c | 0 src/{ => host}/get_z.c | 0 src/{ => host}/init_layer.c | 0 src/{ => host}/init_network.c | 0 src/{ => host}/init_neuron.c | 0 src/{ => host}/matrix.c | 2 +- src/{ => host}/mlp.c | 0 src/{ => host}/read_image_data.c | 0 src/{ => host}/sse.c | 0 src/{ => host}/utils.c | 0 19 files changed, 3 insertions(+), 2 deletions(-) rename dpu_program.c => src/dpu/dpu_program.c (100%) rename src/{ => host}/accumulate_layer_gradients.c (100%) rename src/{ => host}/activation.c (100%) rename src/{ => host}/apply_gradients.c (100%) rename src/{ => host}/drand.c (100%) rename src/{ => host}/get_delta.c (100%) rename src/{ => host}/get_total_loss.c (100%) rename src/{ => host}/get_y.c (100%) rename src/{ => host}/get_z.c (100%) rename src/{ => host}/init_layer.c (100%) rename src/{ => host}/init_network.c (100%) rename src/{ => 
host}/init_neuron.c (100%) rename src/{ => host}/matrix.c (98%) rename src/{ => host}/mlp.c (100%) rename src/{ => host}/read_image_data.c (100%) rename src/{ => host}/sse.c (100%) rename src/{ => host}/utils.c (100%) diff --git a/.gitignore b/.gitignore index 24a870c..274d4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ matmul.c matrices.h -dpu/ *.o *.out training_images.txt diff --git a/include/upmem.h b/include/upmem.h index 67b3069..a2feb78 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -10,6 +10,8 @@ #define NUM_DPU 16 +#define DPU_BINARY_PATH "build/dpu_program" + typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/dpu_program.c b/src/dpu/dpu_program.c similarity index 100% rename from dpu_program.c rename to src/dpu/dpu_program.c diff --git a/src/accumulate_layer_gradients.c b/src/host/accumulate_layer_gradients.c similarity index 100% rename from src/accumulate_layer_gradients.c rename to src/host/accumulate_layer_gradients.c diff --git a/src/activation.c b/src/host/activation.c similarity index 100% rename from src/activation.c rename to src/host/activation.c diff --git a/src/apply_gradients.c b/src/host/apply_gradients.c similarity index 100% rename from src/apply_gradients.c rename to src/host/apply_gradients.c diff --git a/src/drand.c b/src/host/drand.c similarity index 100% rename from src/drand.c rename to src/host/drand.c diff --git a/src/get_delta.c b/src/host/get_delta.c similarity index 100% rename from src/get_delta.c rename to src/host/get_delta.c diff --git a/src/get_total_loss.c b/src/host/get_total_loss.c similarity index 100% rename from src/get_total_loss.c rename to src/host/get_total_loss.c diff --git a/src/get_y.c b/src/host/get_y.c similarity index 100% rename from src/get_y.c rename to src/host/get_y.c diff --git a/src/get_z.c b/src/host/get_z.c similarity index 100% rename from src/get_z.c rename to src/host/get_z.c diff --git a/src/init_layer.c b/src/host/init_layer.c similarity index 100% rename from 
src/init_layer.c rename to src/host/init_layer.c diff --git a/src/init_network.c b/src/host/init_network.c similarity index 100% rename from src/init_network.c rename to src/host/init_network.c diff --git a/src/init_neuron.c b/src/host/init_neuron.c similarity index 100% rename from src/init_neuron.c rename to src/host/init_neuron.c diff --git a/src/matrix.c b/src/host/matrix.c similarity index 98% rename from src/matrix.c rename to src/host/matrix.c index dbc6e9a..12220b6 100644 --- a/src/matrix.c +++ b/src/host/matrix.c @@ -68,7 +68,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); - DPU_ASSERT(dpu_load(dpus, "build/dpu_program", NULL)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; diff --git a/src/mlp.c b/src/host/mlp.c similarity index 100% rename from src/mlp.c rename to src/host/mlp.c diff --git a/src/read_image_data.c b/src/host/read_image_data.c similarity index 100% rename from src/read_image_data.c rename to src/host/read_image_data.c diff --git a/src/sse.c b/src/host/sse.c similarity index 100% rename from src/sse.c rename to src/host/sse.c diff --git a/src/utils.c b/src/host/utils.c similarity index 100% rename from src/utils.c rename to src/host/utils.c From 60f14325b952e2084504dd93de032be8eb9f1148 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 09:50:09 +0100 Subject: [PATCH 06/32] Add Makefile --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7b7fdb4 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang +DPU_UPMEM_CFLAGS += +FILES_TO_DELETE = build/ + +all: clean + mkdir build; \ + $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ + gcc -std=c99 -O0 -Iinclude src/host/*.c 
-D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=8 -DNUM_TRAIN_SAMPLES=40 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + +clean: + rm -rf $(FILES_TO_DELETE) From dd5d1cdb0aeb1e6f37bb6aef21639229d69bb6bc Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 14:02:54 +0100 Subject: [PATCH 07/32] Move `dpu_load` to improve runtime --- src/host/matrix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index 12220b6..bb66fe3 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -68,12 +68,12 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); - DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); - - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t dpu_idx = 0; DPU_FOREACH(dpus, dpu) { + DPU_ASSERT(dpu_load(dpu, DPU_BINARY_PATH, NULL)); + + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) @@ -112,6 +112,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx = 0; DPU_FOREACH(dpus, dpu) { + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) @@ -136,7 +137,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx++; } - dpu_free(dpus); + DPU_ASSERT(dpu_free(dpus)); } void transpose_matrix(const double* A, double *C, int rows, int cols) From bf0b070d90b01627c2398f6a5906031c7d8888c0 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 14:28:55 +0100 Subject: [PATCH 08/32] Broadcast matrix `B` to DPUs instead of copying it --- src/host/matrix.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index bb66fe3..f6edbfc 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -68,12 +68,16 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows struct dpu_set_t dpus, dpu; DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + + unsigned int bytes_b = cols_a * cols_b * sizeof(double); + DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, DPU_XFER_DEFAULT)); + + uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t dpu_idx = 0; DPU_FOREACH(dpus, dpu) { - DPU_ASSERT(dpu_load(dpu, DPU_BINARY_PATH, NULL)); - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) @@ -102,9 +106,6 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows free(A_chunk); } - unsigned int bytes_b = cols_a * cols_b * sizeof(double); - DPU_ASSERT(dpu_copy_to(dpu, "B_whole", 0, B, bytes_b)); - dpu_idx++; } @@ -112,7 +113,7 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows dpu_idx = 0; DPU_FOREACH(dpus, dpu) { - uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU; + uint32_t row_start = dpu_idx * dpu_rows_a_max; uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 
0 : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) From 05d42ca5c93a3e5ce64eb2e4c8fcfdb7efbc507b Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 15:23:43 +0100 Subject: [PATCH 09/32] Move DPU allocation and loading before tiling --- src/host/matrix.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index f6edbfc..d9836cc 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -3,6 +3,8 @@ #include "mlp.h" #include "upmem.h" +struct dpu_set_t dpus, dpu; + void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { double tileA[TILE_SIZE][TILE_SIZE]; @@ -15,6 +17,9 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } + DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + for(int ii=0; ii Date: Fri, 21 Nov 2025 15:42:22 +0100 Subject: [PATCH 10/32] Reorganize matrix multiplication functions --- include/upmem.h | 1 + src/host/matrix.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/upmem.h b/include/upmem.h index a2feb78..09ae4e5 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -19,5 +19,6 @@ typedef struct { } dpu_args_t; void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); #endif diff --git a/src/host/matrix.c b/src/host/matrix.c index d9836cc..f696e79 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -6,6 +6,12 @@ struct dpu_set_t dpus, dpu; void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); + // multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +} + +void multiply_matrix_upmem(const double *A, const double *B, 
double *C, int rows_a, int cols_a, int cols_b) { double tileA[TILE_SIZE][TILE_SIZE]; double tileB[TILE_SIZE][TILE_SIZE]; @@ -40,8 +46,7 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in } } - // multiply_matrix_naive(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); - multiply_matrix_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); + process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); for(int row=0; row Date: Fri, 21 Nov 2025 16:17:36 +0100 Subject: [PATCH 11/32] Add feature toggle for UPMEM to `multiply_matrix` --- src/host/matrix.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/host/matrix.c b/src/host/matrix.c index f696e79..f8b30f1 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -7,8 +7,11 @@ struct dpu_set_t dpus, dpu; void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { +#ifdef UPMEM multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); - // multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +#else + multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); +#endif } void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) From c72628bf9332aed46e432f3f2454820733995a9f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 16:23:02 +0100 Subject: [PATCH 12/32] Adapt unit tests for updates in `multiply_matrix` --- CMakeLists.txt | 31 ++++++++++++++++++++++++++++--- include/upmem.h | 4 ++++ tests/test_matrix.c | 15 ++++++++++++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d408d10..ee5fc92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,18 +6,43 @@ set(CMAKE_C_STANDARD_REQUIRED ON) include_directories(include) -file(GLOB SRC_FILES src/*.c) -list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/mlp.c") 
+file(GLOB SRC_FILES src/host/*.c) +list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/host/mlp.c") file(GLOB TEST_FILES tests/*.c) +execute_process( + COMMAND dpu-pkg-config --cflags dpu + OUTPUT_VARIABLE DPU_C_FLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND dpu-pkg-config --libs dpu + OUTPUT_VARIABLE DPU_LIBS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + enable_testing() +add_custom_target(build_dpu_program ALL + COMMAND dpu-upmem-dpurte-clang + -I${CMAKE_SOURCE_DIR}/include + -o ${CMAKE_BINARY_DIR}/dpu_program + ${CMAKE_SOURCE_DIR}/src/dpu/dpu_program.c +) + +add_compile_definitions( + NUM_DPU=1 + DPU_BINARY_PATH=\"./dpu_program\" +) + foreach(TEST_SRC ${TEST_FILES}) get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) add_executable(${TEST_NAME} ${TEST_SRC} ${SRC_FILES}) target_include_directories(${TEST_NAME} PRIVATE include) - target_link_libraries(${TEST_NAME} m) + target_compile_options(${TEST_NAME} PRIVATE ${DPU_C_FLAGS}) + target_link_libraries(${TEST_NAME} PRIVATE m ${DPU_LIBS}) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) endforeach() \ No newline at end of file diff --git a/include/upmem.h b/include/upmem.h index 09ae4e5..aee5348 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -8,9 +8,13 @@ #define COLS_B_MAX 96 #define ROWS_A_PER_DPU_MAX 16 +#ifndef NUM_DPU #define NUM_DPU 16 +#endif +#ifndef DPU_BINARY_PATH #define DPU_BINARY_PATH "build/dpu_program" +#endif typedef struct { uint32_t rows_a; diff --git a/tests/test_matrix.c b/tests/test_matrix.c index 25323e9..b77650a 100644 --- a/tests/test_matrix.c +++ b/tests/test_matrix.c @@ -12,20 +12,29 @@ int test_multiply_matrix() 3.0, 3.0, 4.0, 0.0}; - // result matrix (initialized with random double values [0.0, 20.0]) + // result matrices (initialized with random double values [0.0, 20.0]) double matrixC[2*2]; + double matrixD[2*2]; for(int i=0; i<2*2; i++) { matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixD[i] = ((double)rand() / (double)RAND_MAX) * 
20; } // ideal result double matrixR[2*2] = {20.0, 12.0, 39.0, 15.0}; - multiply_matrix(matrixA, matrixB, matrixC, 2, 3, 2); + multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2); + + multiply_matrix_upmem(matrixA, matrixB, matrixD, 2, 3, 2); + + for(int i=0; i<2*2; i++) { + printf("%lf ", matrixC[i]); + } for(int i=0; i<2*2; i++) { - test_result_pass_fail |= matrixC[i] == matrixR[i]; + test_result_pass_fail &= matrixC[i] == matrixR[i]; + test_result_pass_fail &= matrixC[i] == matrixD[i]; } return test_result_pass_fail; From ed06cc39aedc1677d2ea2547dbb6d99a24959c8a Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 19:17:11 +0100 Subject: [PATCH 13/32] Reorganize header files & macros --- CMakeLists.txt | 3 ++- include/mlp.h | 2 -- include/upmem.h | 19 ++++++++++++------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee5fc92..26c5fc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,8 @@ add_custom_target(build_dpu_program ALL ) add_compile_definitions( - NUM_DPU=1 + # NUM_CPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between + # dpu_program.c and the rest. So this file should avoid modifying dimensions set through macros in aforementioned header files. 
DPU_BINARY_PATH=\"./dpu_program\" ) diff --git a/include/mlp.h b/include/mlp.h index 4eef999..9ef6cdf 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -31,8 +31,6 @@ #define MOMENTUM 0.8 #endif -#define TILE_SIZE 16 - extern unsigned int rseed; typedef struct { diff --git a/include/upmem.h b/include/upmem.h index aee5348..3464057 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -3,19 +3,24 @@ #include -#define ROWS_A_MAX 96 -#define COLS_A_MAX 96 -#define COLS_B_MAX 96 -#define ROWS_A_PER_DPU_MAX 16 +#define ROWS_A_MAX 320 +#define COLS_A_MAX 320 +#define COLS_B_MAX 320 + +#ifndef DPU_BINARY_PATH +#define DPU_BINARY_PATH "build/dpu_program" +#endif #ifndef NUM_DPU -#define NUM_DPU 16 +#define NUM_DPU 64 #endif -#ifndef DPU_BINARY_PATH -#define DPU_BINARY_PATH "build/dpu_program" +#ifndef TILE_SIZE +#define TILE_SIZE 128 #endif +#define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) + typedef struct { uint32_t rows_a; uint32_t cols_a; From 087f98ddcb07e2e30ce79ba89df2be59ee6b6287 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 19:17:32 +0100 Subject: [PATCH 14/32] Add some assertions to `multiply_matrix_upmem` --- src/host/matrix.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/host/matrix.c b/src/host/matrix.c index f8b30f1..22cfb28 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -1,3 +1,4 @@ +#include #include #include #include "mlp.h" @@ -16,6 +17,8 @@ void multiply_matrix(const double *A, const double *B, double *C, int rows_a, in void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { + assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); + double tileA[TILE_SIZE][TILE_SIZE]; double tileB[TILE_SIZE][TILE_SIZE]; double tileC[TILE_SIZE][TILE_SIZE]; @@ -80,6 +83,9 @@ void multiply_matrix_naive(const double *A, const double *B, double *C, int rows void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { + 
assert(rows_a <= ROWS_A_MAX); + assert(cols_a <= COLS_A_MAX); + assert(cols_b <= COLS_B_MAX); unsigned int bytes_b = cols_a * cols_b * sizeof(double); DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, DPU_XFER_DEFAULT)); From 8d02ab460f85c3152d02b4b08aafb768f3329926 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:03:50 +0100 Subject: [PATCH 15/32] Shorten runtime --- Makefile | 2 +- src/host/mlp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 7b7fdb4..d3b345b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ FILES_TO_DELETE = build/ all: clean mkdir build; \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ - gcc -std=c99 -O0 -Iinclude src/host/*.c -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=8 -DNUM_TRAIN_SAMPLES=40 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + gcc -std=c99 -Iinclude src/host/*.c -DUPMEM -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: rm -rf $(FILES_TO_DELETE) diff --git a/src/host/mlp.c b/src/host/mlp.c index 012b37d..6a6036b 100644 --- a/src/host/mlp.c +++ b/src/host/mlp.c @@ -11,8 +11,8 @@ int main() int epoch = 0; int num_inputs = NUM_FEATURES; - int num_layers = 5; - int num_neurons_per_layer[] = {NUM_FEATURES, 1000, 1000, 100, NUM_LABELS}; + int num_layers = 3; + int num_neurons_per_layer[] = {NUM_FEATURES, 10, NUM_LABELS}; NETWORK *n = init_network(num_inputs, num_layers, num_neurons_per_layer, BATCH_SIZE); if(!n) { From 8a7b83a0e2ba57caa5432bd58b72f50cc48f5be1 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:04:16 +0100 Subject: [PATCH 16/32] Adapt CI for UPMEM toolchain --- .github/build_upmem_toolchain.sh | 7 +++++++ .github/workflows/unit_tests.yaml | 20 +++++++++++++++----- .github/workflows/valgrind.yaml | 8 ++++++-- 3 files changed, 28 insertions(+), 7 deletions(-) 
create mode 100644 .github/build_upmem_toolchain.sh diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh new file mode 100644 index 0000000..cf7b9ab --- /dev/null +++ b/.github/build_upmem_toolchain.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd /opt/ +git clone https://github.com/kagandikmen/upmem-sdk.git +tar -xvf upmem-sdk/upmem-2024.2.0-Linux-x86_64.tar.gz +mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/ +rm -rf upmem-sdk/ \ No newline at end of file diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml index cba3dcc..8f057a3 100644 --- a/.github/workflows/unit_tests.yaml +++ b/.github/workflows/unit_tests.yaml @@ -15,19 +15,29 @@ jobs: submodules: 'recursive' - name: Install dependencies - run: sudo apt update && sudo apt install -y build-essential + run: | + sudo apt update && sudo apt install -y build-essential + sudo ./.github/build_upmem_toolchain.sh - name: Create build directory - run: mkdir build + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + mkdir build - name: Run CMake working-directory: build - run: cmake .. + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + cmake .. 
- name: Build working-directory: build - run: make + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + make - name: Run the tests working-directory: build - run: make test \ No newline at end of file + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh + make test \ No newline at end of file diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/valgrind.yaml index 69cfa97..f38c27a 100644 --- a/.github/workflows/valgrind.yaml +++ b/.github/workflows/valgrind.yaml @@ -19,21 +19,25 @@ jobs: sudo apt update sudo apt install -y build-essential valgrind pip3 install numpy + sudo ./.github/build_upmem_toolchain.sh - name: Extract training samples & labels run: python3 read_dataset.py - name: Compile MLP - run: gcc -g -DEPSILON=0.5 -DNUM_TRAIN_SAMPLES=2 -Iinclude src/*.c -o mlp -lm + run: | + source /usr/local/bin/upmem-sdk/upmem_env.sh + make - name: Run Valgrind run: | + source /usr/local/bin/upmem-sdk/upmem_env.sh valgrind --leak-check=full \ --show-leak-kinds=all \ --track-origins=yes \ --error-exitcode=1 \ --log-file=valgrind.txt \ - ./mlp > /dev/null + ./build/mlp > /dev/null - name: Save Valgrind log if: always() From 5b1646a458fdfb57d55aad1e5bcbe8d1f8bb09c9 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Fri, 21 Nov 2025 22:08:40 +0100 Subject: [PATCH 17/32] Update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a4acb69..7deb945 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ python3 read_dataset.py 3. Compile the application: ```bash -gcc -Iinclude src/*.c -o mlp -lm +make ``` With this command, you can use: @@ -86,11 +86,11 @@ With this command, you can use: ## Status -UPMEM-MLP is a work in progress as of 2025-11-14. +UPMEM-MLP is a work in progress as of 2025-11-21. 
### To-Do -- [ ] Adapt `multiply_matrix` for in-memory matrix multiplication on UPMEM +- [ ] Evaluate and document acceleration achieved by matrix multiplication on UPMEM DIMM ## License From 404ab1d0dc7f5785e7c1472cc369da41def2d51f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen <136203535+kagandikmen@users.noreply.github.com> Date: Sat, 22 Nov 2025 09:07:40 +0100 Subject: [PATCH 18/32] Debug CI (#1) * Debug CI commit 1 * Debug CI commit 2 * Debug CI commit 3 * Debug CI commit 4 * Debug CI commit 5 * Debug CI commit 6 * Debug CI commit 7 * Debug CI commit 8 * Debug CI commit 9 * Debug CI commit 10 * Debug CI commit 11 * Debug CI commit 12 * Debug CI commit 13 * Debug CI commit 14 * Debug CI commit 15 * Debug CI commit 16 * Debug CI commit 17 --- .../{valgrind.yaml => memory_leak_tests.yaml} | 27 +++++++++++++------ .github/workflows/unit_tests.yaml | 10 +++---- Makefile | 15 +++++++++-- 3 files changed, 36 insertions(+), 16 deletions(-) rename .github/workflows/{valgrind.yaml => memory_leak_tests.yaml} (54%) diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/memory_leak_tests.yaml similarity index 54% rename from .github/workflows/valgrind.yaml rename to .github/workflows/memory_leak_tests.yaml index f38c27a..e6a8ba3 100644 --- a/.github/workflows/valgrind.yaml +++ b/.github/workflows/memory_leak_tests.yaml @@ -1,4 +1,4 @@ -name: Valgrind +name: Memory Leak Tests on: push: @@ -6,7 +6,7 @@ on: jobs: memcheck: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -19,19 +19,19 @@ jobs: sudo apt update sudo apt install -y build-essential valgrind pip3 install numpy - sudo ./.github/build_upmem_toolchain.sh + sudo bash .github/build_upmem_toolchain.sh - name: Extract training samples & labels run: python3 read_dataset.py - - name: Compile MLP + - name: Compile MLP without sanitizer or UPMEM run: | - source /usr/local/bin/upmem-sdk/upmem_env.sh - make + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + 
make SAN=0 UPMEM=0 - name: Run Valgrind run: | - source /usr/local/bin/upmem-sdk/upmem_env.sh + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator valgrind --leak-check=full \ --show-leak-kinds=all \ --track-origins=yes \ @@ -44,4 +44,15 @@ jobs: uses: actions/upload-artifact@v4 with: name: valgrind_log - path: valgrind.txt \ No newline at end of file + path: valgrind.txt + + - name: Compile MLP with sanitizer and UPMEM + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + make clean + make SAN=1 UPMEM=1 + + - name: Run with sanitizer + run: | + source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator + ./build/mlp > /dev/null \ No newline at end of file diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml index 8f057a3..897d053 100644 --- a/.github/workflows/unit_tests.yaml +++ b/.github/workflows/unit_tests.yaml @@ -6,7 +6,7 @@ on: jobs: build-and-test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -16,13 +16,11 @@ jobs: - name: Install dependencies run: | - sudo apt update && sudo apt install -y build-essential - sudo ./.github/build_upmem_toolchain.sh + sudo apt update && sudo apt install -y build-essential python3.10 python3.10-dev + sudo bash .github/build_upmem_toolchain.sh - name: Create build directory - run: | - source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh - mkdir build + run: mkdir build - name: Run CMake working-directory: build diff --git a/Makefile b/Makefile index d3b345b..6b1b21a 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,22 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang DPU_UPMEM_CFLAGS += +CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ -all: clean +UPMEM ?= 1 +ifeq ($(UPMEM), 1) + CFLAGS += -DUPMEM +endif + +SAN ?= 0 +ifeq ($(SAN), 1) + CFLAGS += -fsanitize=address,undefined,leak 
-fno-omit-frame-pointer -g +endif + +all: mkdir build; \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ - gcc -std=c99 -Iinclude src/host/*.c -DUPMEM -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` + gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: rm -rf $(FILES_TO_DELETE) From d0d4be897ecd0031a60d98a168a7daac71d8d521 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 10:36:52 +0100 Subject: [PATCH 19/32] Add `_dpus` & improve organization --- include/upmem.h | 4 ++ src/host/dpu_host.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ src/host/matrix.c | 134 +---------------------------------------- src/host/mlp.c | 6 ++ tests/test_matrix.c | 3 + 5 files changed, 158 insertions(+), 133 deletions(-) create mode 100644 src/host/dpu_host.c diff --git a/include/upmem.h b/include/upmem.h index 3464057..4b90e89 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -27,6 +27,10 @@ typedef struct { uint32_t cols_b; } dpu_args_t; +extern int upmem_initialized; + +void free_dpus(); +void init_dpus(); void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c new file mode 100644 index 0000000..e6a767b --- /dev/null +++ b/src/host/dpu_host.c @@ -0,0 +1,144 @@ +#include +#include +#include "upmem.h" + +struct dpu_set_t dpus, dpu; +int upmem_initialized = 0; + +void free_dpus() +{ + DPU_ASSERT(dpu_free(dpus)); +} + +void init_dpus() +{ + if(!upmem_initialized) { + assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); + + DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); + DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); + + upmem_initialized = 1; + } +} + +void 
multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +{ + double tileA[TILE_SIZE][TILE_SIZE]; + double tileB[TILE_SIZE][TILE_SIZE]; + double tileC[TILE_SIZE][TILE_SIZE]; + + for(int i=0; i= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + dpu_args_t args = { + .rows_a = dpu_rows_a_actual, + .cols_a = cols_a, + .cols_b = cols_b + }; + + DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); + + if(dpu_rows_a_actual) { + uint32_t elems_a = dpu_rows_a_actual * cols_a; + uint32_t bytes_a = elems_a * sizeof(double); + + double *A_chunk = (double*)malloc(bytes_a); + + for(int r=0; r= rows_a) ? 0 + : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) + : dpu_rows_a_max; + + if(dpu_rows_a_actual) { + uint32_t elems_c = dpu_rows_a_actual * cols_b; + uint32_t bytes_c = elems_c * sizeof(double); + + double *C_chunk = (double*)malloc(bytes_c); + + DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); + + for(int r=0; r -#include -#include #include "mlp.h" #include "upmem.h" -struct dpu_set_t dpus, dpu; - void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { #ifdef UPMEM + init_dpus(); multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b); #else multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b); #endif } -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) -{ - assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); - - double tileA[TILE_SIZE][TILE_SIZE]; - double tileB[TILE_SIZE][TILE_SIZE]; - double tileC[TILE_SIZE][TILE_SIZE]; - - for(int i=0; i= rows_a) ? 0 - : (dpu_rows_a_max > rows_a - row_start) ? 
(rows_a - row_start) - : dpu_rows_a_max; - - dpu_args_t args = { - .rows_a = dpu_rows_a_actual, - .cols_a = cols_a, - .cols_b = cols_b - }; - - DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args))); - - if(dpu_rows_a_actual) { - uint32_t elems_a = dpu_rows_a_actual * cols_a; - uint32_t bytes_a = elems_a * sizeof(double); - - double *A_chunk = (double*)malloc(bytes_a); - - for(int r=0; r= rows_a) ? 0 - : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start) - : dpu_rows_a_max; - - if(dpu_rows_a_actual) { - uint32_t elems_c = dpu_rows_a_actual * cols_b; - uint32_t bytes_c = elems_c * sizeof(double); - - double *C_chunk = (double*)malloc(bytes_c); - - DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c)); - - for(int r=0; r Date: Sat, 22 Nov 2025 13:38:46 +0100 Subject: [PATCH 20/32] Parallelize DPU workload through tasklets --- Makefile | 2 +- include/upmem.h | 2 +- src/dpu/dpu_program.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6b1b21a..5c9f6da 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang -DPU_UPMEM_CFLAGS += +DPU_UPMEM_CFLAGS += -DNR_TASKLETS=4 CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ diff --git a/include/upmem.h b/include/upmem.h index 4b90e89..47a32d5 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -16,7 +16,7 @@ #endif #ifndef TILE_SIZE -#define TILE_SIZE 128 +#define TILE_SIZE 256 #endif #define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 193587e..fe3f470 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -18,7 +18,10 @@ int main() if(!rows_a) return 0; - for(int i=0; i Date: Sat, 22 Nov 2025 14:48:53 +0100 Subject: [PATCH 21/32] Simplify macro scheme --- include/upmem.h | 10 ++-------- src/dpu/dpu_program.c | 6 +++--- 
src/host/dpu_host.c | 6 ------ 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/include/upmem.h b/include/upmem.h index 47a32d5..24df7a8 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -3,24 +3,18 @@ #include -#define ROWS_A_MAX 320 -#define COLS_A_MAX 320 -#define COLS_B_MAX 320 - #ifndef DPU_BINARY_PATH #define DPU_BINARY_PATH "build/dpu_program" #endif #ifndef NUM_DPU -#define NUM_DPU 64 +#define NUM_DPU 8 #endif #ifndef TILE_SIZE -#define TILE_SIZE 256 +#define TILE_SIZE 32 #endif -#define ROWS_A_PER_DPU_MAX ((ROWS_A_MAX + NUM_DPU - 1) / NUM_DPU) - typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index fe3f470..32697e3 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -2,9 +2,9 @@ #include #include "upmem.h" -__mram_noinit double A_chunk[ROWS_A_PER_DPU_MAX * COLS_A_MAX]; -__mram_noinit double B_whole[COLS_A_MAX * COLS_B_MAX]; -__mram_noinit double C_chunk[ROWS_A_PER_DPU_MAX * COLS_B_MAX]; +__mram_noinit double A_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit double B_whole[TILE_SIZE * TILE_SIZE]; +__mram_noinit double C_chunk[TILE_SIZE * TILE_SIZE]; __host dpu_args_t DPU_INPUT_ARGS; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index e6a767b..e923e98 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -13,8 +13,6 @@ void free_dpus() void init_dpus() { if(!upmem_initialized) { - assert(TILE_SIZE / NUM_DPU <= ROWS_A_PER_DPU_MAX); - DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus)); DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL)); @@ -70,10 +68,6 @@ void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) { - assert(rows_a <= ROWS_A_MAX); - assert(cols_a <= COLS_A_MAX); - assert(cols_b <= COLS_B_MAX); - unsigned int bytes_b = cols_a * cols_b * sizeof(double); DPU_ASSERT(dpu_broadcast_to(dpus, "B_whole", 0, B, bytes_b, 
DPU_XFER_DEFAULT)); From c1d1b3e30615dfb317a058478c89b837ecffb68d Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 16:19:10 +0100 Subject: [PATCH 22/32] Move from double to single-precision float --- include/mlp.h | 36 ++++++++++++------------- include/upmem.h | 4 +-- src/dpu/dpu_program.c | 8 +++--- src/host/accumulate_layer_gradients.c | 6 ++--- src/host/activation.c | 8 +++--- src/host/apply_gradients.c | 8 +++--- src/host/dpu_host.c | 24 ++++++++--------- src/host/drand.c | 4 +-- src/host/get_delta.c | 10 +++---- src/host/get_total_loss.c | 8 +++--- src/host/get_y.c | 6 ++--- src/host/get_z.c | 6 ++--- src/host/init_layer.c | 4 +-- src/host/init_neuron.c | 10 +++---- src/host/matrix.c | 8 +++--- src/host/mlp.c | 36 ++++++++++++------------- src/host/sse.c | 8 +++--- src/host/utils.c | 10 +++---- tests/test_accumulate_layer_gradients.c | 4 +-- tests/test_activation.c | 8 +++--- tests/test_drand.c | 2 +- tests/test_get_delta.c | 8 +++--- tests/test_get_y.c | 12 ++++----- tests/test_get_z.c | 10 +++---- tests/test_init_layer.c | 8 +++--- tests/test_init_network.c | 2 +- tests/test_init_neuron.c | 4 +-- tests/test_matrix.c | 28 +++++++++---------- tests/test_sse.c | 6 ++--- 29 files changed, 148 insertions(+), 148 deletions(-) diff --git a/include/mlp.h b/include/mlp.h index 9ef6cdf..081cea9 100644 --- a/include/mlp.h +++ b/include/mlp.h @@ -35,13 +35,13 @@ extern unsigned int rseed; typedef struct { int num_weights; - double *w, *lw; - double *batch_dw; + float *w, *lw; + float *batch_dw; } NEURON; typedef struct { int num_neurons; - double *inputs, *deltas; + float *inputs, *deltas; NEURON *n; } LAYER; @@ -50,23 +50,23 @@ typedef struct { LAYER *l; } NETWORK; -void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate); +void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate); void apply_gradients(NETWORK *n, int batch_size); -double drand(); -double get_activation(double x); -double 
get_activation_derivative(double x); -double *get_delta(NETWORK *n, double *samples, double *ideal, int layer_index); -double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples); -double *get_y(NETWORK *n, int layer_index, double *sample); -double *get_z(NETWORK *n, int layer_index, double *sample); +float drand(); +float get_activation(float x); +float get_activation_derivative(float x); +float *get_delta(NETWORK *n, float *samples, float *ideal, int layer_index); +float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples); +float *get_y(NETWORK *n, int layer_index, float *sample); +float *get_z(NETWORK *n, int layer_index, float *sample); LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size); NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size); NEURON *init_neuron(int num_weights); -void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); -void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols); -double sse(double *real, double *ideal, int length); -void transpose_matrix(const double *A, double *C, int rows, int cols); +float sse(float *real, float *ideal, int length); +void transpose_matrix(const float *A, float *C, int rows, int cols); // // utility functions @@ -76,10 +76,10 @@ void free_layer(LAYER *l); void free_network(NETWORK *n); void free_neuron(NEURON *n); -void free_double_matrix(double **addr, int nrows); +void free_float_matrix(float **addr, int nrows); void free_uint8_matrix(uint8_t **addr, int nrows); -void print_double_matrix(double **addr, int nrows, 
int ncols); -void print_double_vector(double *addr, int nrows); +void print_float_matrix(float **addr, int nrows, int ncols); +void print_float_vector(float *addr, int nrows); #endif diff --git a/include/upmem.h b/include/upmem.h index 24df7a8..726ee31 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -25,7 +25,7 @@ extern int upmem_initialized; void free_dpus(); void init_dpus(); -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); -void process_tile_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b); +void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); +void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b); #endif diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 32697e3..1155bac 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -2,9 +2,9 @@ #include #include "upmem.h" -__mram_noinit double A_chunk[TILE_SIZE * TILE_SIZE]; -__mram_noinit double B_whole[TILE_SIZE * TILE_SIZE]; -__mram_noinit double C_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE]; +__mram_noinit float B_whole[TILE_SIZE * TILE_SIZE]; +__mram_noinit float C_chunk[TILE_SIZE * TILE_SIZE]; __host dpu_args_t DPU_INPUT_ARGS; @@ -23,7 +23,7 @@ int main() for(int i=row_start; i<(row_start+chunk); ++i) { for(int j=0; jnum_neurons; int num_weights = l->n->num_weights; - double *gradient = (double *) malloc (num_neurons * num_weights * sizeof(double)); + float *gradient = (float *) malloc (num_neurons * num_weights * sizeof(float)); if(!gradient) { return; } - double *deltas_T = (double*) malloc (num_neurons * batch_size * sizeof(double)); + float *deltas_T = (float*) malloc (num_neurons * batch_size * sizeof(float)); if(!deltas_T) { free(gradient); return; diff --git a/src/host/activation.c b/src/host/activation.c index eeaaee7..5345ec6 100644 --- 
a/src/host/activation.c +++ b/src/host/activation.c @@ -1,11 +1,11 @@ #include "mlp.h" -double get_activation(double x) +float get_activation(float x) { - return tanh(x); + return tanhf(x); } -double get_activation_derivative(double x) +float get_activation_derivative(float x) { - return 1.0 / pow(cosh(x), 2); + return 1.0 / powf(coshf(x), 2); } \ No newline at end of file diff --git a/src/host/apply_gradients.c b/src/host/apply_gradients.c index 4bc143b..ede95e7 100644 --- a/src/host/apply_gradients.c +++ b/src/host/apply_gradients.c @@ -15,11 +15,11 @@ void apply_gradients(NETWORK *n, int batch_size) for(int k=0; knum_weights; k++) // do the following for all weights "k" of said neuron: { - double previous_weight_update = np->w[k] - np->lw[k]; - double momentum_term = MOMENTUM * previous_weight_update; - double gradient_term = np->batch_dw[k] / (double) batch_size; + float previous_weight_update = np->w[k] - np->lw[k]; + float momentum_term = MOMENTUM * previous_weight_update; + float gradient_term = np->batch_dw[k] / (float) batch_size; - double old_weight = np->w[k]; + float old_weight = np->w[k]; np->lw[k] = old_weight; np->w[k] = old_weight + gradient_term + momentum_term; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index e923e98..ac203a0 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -20,11 +20,11 @@ void init_dpus() } } -void multiply_matrix_upmem(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { - double tileA[TILE_SIZE][TILE_SIZE]; - double tileB[TILE_SIZE][TILE_SIZE]; - double tileC[TILE_SIZE][TILE_SIZE]; + float tileA[TILE_SIZE][TILE_SIZE]; + float tileB[TILE_SIZE][TILE_SIZE]; + float tileC[TILE_SIZE][TILE_SIZE]; for(int i=0; il+layer_index)->num_neurons; - double *d = (double*) malloc (sizeof(double) * layer_size); + float *d = (float*) malloc (sizeof(float) * layer_size); if(!d) { 
fprintf(stderr, "Error 10010\n"); return NULL; } - double *z = get_z(n, layer_index, sample); + float *z = get_z(n, layer_index, sample); if(!z) { fprintf(stderr, "Error 10011\n"); free(d); @@ -21,7 +21,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index) if(is_current_layer_last_layer) { - double *y = get_y(n, layer_index, sample); + float *y = get_y(n, layer_index, sample); if(!y) { fprintf(stderr, "Error 10012\n"); free(d); @@ -36,7 +36,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index) } else { - double *next_d = get_delta(n, sample, ideal, layer_index+1); + float *next_d = get_delta(n, sample, ideal, layer_index+1); if(!next_d) { fprintf(stderr, "Error 10013\n"); free(d); diff --git a/src/host/get_total_loss.c b/src/host/get_total_loss.c index 8bf7f2c..c386536 100644 --- a/src/host/get_total_loss.c +++ b/src/host/get_total_loss.c @@ -1,8 +1,8 @@ #include "mlp.h" -double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples) +float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples) { - double *total_loss = (double*) malloc (sizeof(double)); + float *total_loss = (float*) malloc (sizeof(float)); if(!total_loss) { fprintf(stderr, "Error 10007\n"); return NULL; @@ -13,13 +13,13 @@ double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsample LAYER *last_layer = n->l+(n->num_layers-1); for(int i=0; inum_layers-1, samples[i]); + float *y = get_y(n, n->num_layers-1, samples[i]); if(!y) { fprintf(stderr, "Error 10008\n"); free(total_loss); return NULL; } - *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (double)nsamples; + *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (float)nsamples; free(y); } diff --git a/src/host/get_y.c b/src/host/get_y.c index 3e5b70e..5931c62 100644 --- a/src/host/get_y.c +++ b/src/host/get_y.c @@ -2,7 +2,7 @@ // preactivation -> get_y -> activation -double *get_y(NETWORK *n, int layer_index, 
double *sample) +float *get_y(NETWORK *n, int layer_index, float *sample) { LAYER *current_layer = n->l+layer_index; int is_current_layer_last_layer = (n->num_layers == layer_index + 1); @@ -11,9 +11,9 @@ double *get_y(NETWORK *n, int layer_index, double *sample) if(!is_current_layer_last_layer) // add bias node y_size++; - double *z = get_z(n, layer_index, sample); + float *z = get_z(n, layer_index, sample); - double *y = (double *) malloc (sizeof(double)*y_size); + float *y = (float *) malloc (sizeof(float)*y_size); if(!y) { fprintf(stderr, "Error 10006\n"); return NULL; diff --git a/src/host/get_z.c b/src/host/get_z.c index ad7a08d..466ee1a 100644 --- a/src/host/get_z.c +++ b/src/host/get_z.c @@ -2,20 +2,20 @@ // samples -> get_z -> preactivation -double *get_z(NETWORK *n, int layer_index, double *sample) +float *get_z(NETWORK *n, int layer_index, float *sample) { LAYER *current_layer = n->l+layer_index; int z_neuroncount = current_layer->num_neurons; int z_weightcount = current_layer->n->num_weights; int is_first_layer = layer_index == 0; - double *z = (double *) malloc (sizeof(double)* z_neuroncount); + float *z = (float *) malloc (sizeof(float)* z_neuroncount); if(!z) { fprintf(stderr, "Error 10005\n"); return NULL; } - double *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample); + float *z_prev = is_first_layer ? 
sample : get_y(n, layer_index-1, sample); for(size_t i=0; inum_neurons = num_neurons; - l->inputs = (double*) malloc (batch_size * num_weights_per_neuron * sizeof(double)); + l->inputs = (float*) malloc (batch_size * num_weights_per_neuron * sizeof(float)); if(!l->inputs) { free(l); return NULL; } - l->deltas = (double*) malloc (batch_size * num_neurons * sizeof(double)); + l->deltas = (float*) malloc (batch_size * num_neurons * sizeof(float)); if(!l->deltas) { free(l->inputs); free(l); diff --git a/src/host/init_neuron.c b/src/host/init_neuron.c index b5506f3..450677e 100644 --- a/src/host/init_neuron.c +++ b/src/host/init_neuron.c @@ -9,20 +9,20 @@ NEURON *init_neuron(int num_weights) n->num_weights = num_weights; - n->w = (double *) malloc (sizeof(double) * n->num_weights); + n->w = (float *) malloc (sizeof(float) * n->num_weights); if(!n->w) { free(n); return NULL; } - n->lw = (double *) malloc (sizeof(double) * n->num_weights); + n->lw = (float *) malloc (sizeof(float) * n->num_weights); if(!n->lw) { free(n->w); free(n); return NULL; } - n->batch_dw = (double *) malloc (sizeof(double) * n->num_weights); + n->batch_dw = (float *) malloc (sizeof(float) * n->num_weights); if(!n->batch_dw) { free(n->lw); free(n->w); @@ -30,11 +30,11 @@ NEURON *init_neuron(int num_weights) return NULL; } - double limit = 1.0/sqrt((double) num_weights); + float limit = 1.0/sqrt((float) num_weights); for(int i=0; iw[i] = (rand_unit * 2.0 - 1.0) * limit; n->lw[i] = n->w[i]; n->batch_dw[i] = 0; diff --git a/src/host/matrix.c b/src/host/matrix.c index 017d014..967ba10 100644 --- a/src/host/matrix.c +++ b/src/host/matrix.c @@ -1,7 +1,7 @@ #include "mlp.h" #include "upmem.h" -void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { #ifdef UPMEM init_dpus(); @@ -11,11 +11,11 @@ void multiply_matrix(const double *A, const double *B, double *C, 
int rows_a, in #endif } -void multiply_matrix_naive(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b) +void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b) { for(int i=0; inum_layers-1; j>=0; --j) { LAYER *lp = n->l+j; // ptr to layer j of network n - double *d = get_delta(n, samples[i], labels[i], j); + float *d = get_delta(n, samples[i], labels[i], j); - memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(double)); + memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(float)); - double *py = j ? get_y(n, j-1, samples[i]) : NULL; + float *py = j ? get_y(n, j-1, samples[i]) : NULL; if(j && !py) { fprintf(stderr, "Error 10009\n"); return 1; } - memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(double)); + memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(float)); free(d); if(j) free(py); @@ -106,18 +106,18 @@ int main() apply_gradients(n, actual_batch_size); } - double *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES); + float *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES); if(!loss_new) { fprintf(stderr, "Error 10015\n"); return 1; } - double loss_delta = fabs(*loss_new - *loss_prev); + float loss_delta = fabs(*loss_new - *loss_prev); epoch++; #ifdef VERBOSE - printf("Epoch %-3d --- Lost Delta = %.9lf --- Final Loss = %.6lf\n", epoch, loss_delta, *loss_new); + printf("Epoch %-3d --- Lost Delta = %.9f --- Final Loss = %.6f\n", epoch, loss_delta, *loss_new); #endif free(loss_prev); @@ -135,7 +135,7 @@ int main() LAYER *lp = n->l+i; // ptr to i-th layer of the network n for(int j=0; jnum_neurons; j++) { NEURON *np = lp->n+j; // ptr to j-th neuron of the i-th layer of network n - print_double_vector(np->w, np->num_weights); + print_float_vector(np->w, np->num_weights); printf("\n"); } printf("\n\n"); @@ -148,8 
+148,8 @@ int main() } // memory cleanup before termination - free_double_matrix(samples, NUM_TRAIN_SAMPLES); - free_double_matrix(labels, NUM_TRAIN_SAMPLES); + free_float_matrix(samples, NUM_TRAIN_SAMPLES); + free_float_matrix(labels, NUM_TRAIN_SAMPLES); free_network(n); return 0; diff --git a/src/host/sse.c b/src/host/sse.c index ab3477b..cf58db9 100644 --- a/src/host/sse.c +++ b/src/host/sse.c @@ -1,12 +1,12 @@ #include "mlp.h" -double sse(double *real, double *ideal, int length) +float sse(float *real, float *ideal, int length) { - double sse = 0.0; // Sum of squared errors + float sse = 0.0; // Sum of squared errors for(size_t i=0; inum_weights = 0; } -void free_double_matrix(double **addr, int nrows) +void free_float_matrix(float **addr, int nrows) { if(!addr) return; @@ -68,19 +68,19 @@ void free_uint8_matrix(uint8_t **addr, int nrows) free(addr); } -void print_double_matrix(double **addr, int nrows, int ncols) +void print_float_matrix(float **addr, int nrows, int ncols) { for(size_t i=0; iinputs[i] = ((double) rand() / (double) RAND_MAX) * 20; + first_layer->inputs[i] = ((float) rand() / (float) RAND_MAX) * 20; // deltas is a 1x4 identity matrix for(int i=0; i<1*4; i++) first_layer->deltas[i] = 1.0; - double batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + float batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; for(int i=0; i<4; i++) for(int j=0; j<5; j++) diff --git a/tests/test_activation.c b/tests/test_activation.c index 8998218..2d120bb 100644 --- a/tests/test_activation.c +++ b/tests/test_activation.c @@ -1,12 +1,12 @@ #include "mlp.h" #include "test.h" -int test_activation(double x) +int test_activation(float x) { - double activation_result = get_activation(x); - double activation_derivative_result = get_activation_derivative(x); + float activation_result = get_activation(x); + float activation_derivative_result = get_activation_derivative(x); - double 
expected_activation_derivative = 1 - pow(activation_result, 2); + float expected_activation_derivative = 1 - powf(activation_result, 2); if(abs(activation_derivative_result - expected_activation_derivative) < 1e-5) return 1; diff --git a/tests/test_drand.c b/tests/test_drand.c index 1411e08..2771bb1 100644 --- a/tests/test_drand.c +++ b/tests/test_drand.c @@ -7,7 +7,7 @@ int test_drand() for(int i=0; i<10; i++) { - double test_value = drand(); + float test_value = drand(); test_pass_fail &= (test_value >= 0.0) && (test_value <= 1.0); } diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c index b8a06c6..068f347 100644 --- a/tests/test_get_delta.c +++ b/tests/test_get_delta.c @@ -6,8 +6,8 @@ int test_get_delta() int test_pass_fail = 1; int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; - double ideals[] = {3, 3, 3, 3}; + float samples[] = {1, 1, 1, 1}; + float ideals[] = {3, 3, 3, 3}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -28,7 +28,7 @@ int test_get_delta() // test last layer delta - double *d_last_layer = get_delta(n, samples, ideals, 1); + float *d_last_layer = get_delta(n, samples, ideals, 1); for(int i=0; i<3; i++) { @@ -37,7 +37,7 @@ int test_get_delta() // test before-last layer delta - double *d_first_layer = get_delta(n, samples, ideals, 0); + float *d_first_layer = get_delta(n, samples, ideals, 0); for(int i=0; i<3; i++) { diff --git a/tests/test_get_y.c b/tests/test_get_y.c index 4274682..de7cb02 100644 --- a/tests/test_get_y.c +++ b/tests/test_get_y.c @@ -4,7 +4,7 @@ int test_get_y() { int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; + float samples[] = {1, 1, 1, 1}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -23,12 +23,12 @@ int test_get_y() n->l[0].n[2].w[2] = 0.0; n->l[0].n[2].w[3] = 0.0; - double *y = get_y(n, 0, samples); - double *z = get_z(n, 0, samples); + float *y = get_y(n, 0, samples); + float *z = get_z(n, 0, samples); - // 
printf("y[0] == %.2lf\n", y[0]); - // printf("y[1] == %.2lf\n", y[1]); - // printf("y[2] == %.2lf\n", y[2]); + // printf("y[0] == %.2f\n", y[0]); + // printf("y[1] == %.2f\n", y[1]); + // printf("y[2] == %.2f\n", y[2]); int test_pass_fail = (y[0] == 1) && (y[1] == get_activation(z[0])) diff --git a/tests/test_get_z.c b/tests/test_get_z.c index be921c6..7244651 100644 --- a/tests/test_get_z.c +++ b/tests/test_get_z.c @@ -4,7 +4,7 @@ int test_get_z() { int num_neurons_per_layers[] = {3, 3}; - double samples[] = {1, 1, 1, 1}; + float samples[] = {1, 1, 1, 1}; NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE); @@ -23,11 +23,11 @@ int test_get_z() n->l[0].n[2].w[2] = 0.0; n->l[0].n[2].w[3] = 0.0; - double *z = get_z(n, 0, samples); + float *z = get_z(n, 0, samples); - // printf("z[0] == %.2lf\n", z[0]); - // printf("z[1] == %.2lf\n", z[1]); - // printf("z[2] == %.2lf\n", z[2]); + // printf("z[0] == %.2f\n", z[0]); + // printf("z[1] == %.2f\n", z[1]); + // printf("z[2] == %.2f\n", z[2]); int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1); diff --git a/tests/test_init_layer.c b/tests/test_init_layer.c index 13b7ae6..761f331 100644 --- a/tests/test_init_layer.c +++ b/tests/test_init_layer.c @@ -6,10 +6,10 @@ int test_init_layer() LAYER *l = init_layer(3, 4, BATCH_SIZE); // printf("%d\n", l->num_neurons); - // printf("%lf\n", l->n[0].w[0]); - // printf("%lf\n", l->n[1].w[0]); - // printf("%lf\n", l->n[2].w[0]); - // printf("%lf\n", l->n[0].lw[0]); + // printf("%f\n", l->n[0].w[0]); + // printf("%f\n", l->n[1].w[0]); + // printf("%f\n", l->n[2].w[0]); + // printf("%f\n", l->n[0].lw[0]); // printf("%d\n", l->n[0].num_weights); // printf("%d\n", l->n[1].num_weights); // printf("%d\n", l->n[2].num_weights); diff --git a/tests/test_init_network.c b/tests/test_init_network.c index 375565f..2e5603b 100644 --- a/tests/test_init_network.c +++ b/tests/test_init_network.c @@ -12,7 +12,7 @@ int test_init_network() // printf("%d\n", n->l[1].num_neurons); 
// printf("%d\n", n->l[2].num_neurons); // printf("%d\n", n->l[0].n[0].num_weights); - // printf("%lf\n", n->l[0].n[0].lw[0]); + // printf("%f\n", n->l[0].n[0].lw[0]); // printf("%d\n", n->l[1].n[0].num_weights); // printf("%d\n", n->l[2].n[0].num_weights); diff --git a/tests/test_init_neuron.c b/tests/test_init_neuron.c index 61d0232..486548c 100644 --- a/tests/test_init_neuron.c +++ b/tests/test_init_neuron.c @@ -6,8 +6,8 @@ int test_init_neuron() NEURON *n = init_neuron(2); // printf("%d\n", n->num_weights); - // printf("%lf\n", n->w[0]); - // printf("%lf\n", n->lw[0]); + // printf("%f\n", n->w[0]); + // printf("%f\n", n->lw[0]); return (n->num_weights == 2) && (n->w[0] <= 1) && (n->w[0] >= -1) && (n->lw[0] == n->w[0]); } diff --git a/tests/test_matrix.c b/tests/test_matrix.c index 699646e..37e9308 100644 --- a/tests/test_matrix.c +++ b/tests/test_matrix.c @@ -6,23 +6,23 @@ int test_multiply_matrix() { int test_result_pass_fail = 1; - double matrixA[2*3] = {1.0, 2.0, 3.0, + float matrixA[2*3] = {1.0, 2.0, 3.0, 0.0, 5.0, 6.0}; - double matrixB[3*2] = {2.0, 6.0, + float matrixB[3*2] = {2.0, 6.0, 3.0, 3.0, 4.0, 0.0}; - // result matrices (initialized with random double values [0.0, 20.0]) - double matrixC[2*2]; - double matrixD[2*2]; + // result matrices (initialized with random float values [0.0, 20.0]) + float matrixC[2*2]; + float matrixD[2*2]; for(int i=0; i<2*2; i++) { - matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20; - matrixD[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixC[i] = ((float)rand() / (float)RAND_MAX) * 20; + matrixD[i] = ((float)rand() / (float)RAND_MAX) * 20; } // ideal result - double matrixR[2*2] = {20.0, 12.0, + float matrixR[2*2] = {20.0, 12.0, 39.0, 15.0}; multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2); @@ -32,7 +32,7 @@ int test_multiply_matrix() free_dpus(); for(int i=0; i<2*2; i++) { - printf("%lf ", matrixC[i]); + printf("%f ", matrixC[i]); } for(int i=0; i<2*2; i++) { @@ -47,17 +47,17 @@ int 
test_transpose_matrix() { int test_result_pass_fail = 1; - double matrixA[2*3] = {1.0, 2.0, 3.0, + float matrixA[2*3] = {1.0, 2.0, 3.0, 0.0, 5.0, 6.0}; - // result matrix (initialized with random double values [0.0, 20.0]) - double matrixT[3*2]; + // result matrix (initialized with random float values [0.0, 20.0]) + float matrixT[3*2]; for(int i=0; i<3*2; i++) { - matrixT[i] = ((double)rand() / (double)RAND_MAX) * 20; + matrixT[i] = ((float)rand() / (float)RAND_MAX) * 20; } // ideal result - double matrixR[3*2] = {1.0, 0.0, + float matrixR[3*2] = {1.0, 0.0, 2.0, 5.0, 3.0, 6.0}; diff --git a/tests/test_sse.c b/tests/test_sse.c index 8f660c0..9cacee5 100644 --- a/tests/test_sse.c +++ b/tests/test_sse.c @@ -3,11 +3,11 @@ int test_sse() { - double real[] = {3, 4, 4, 4}; - double ideal[] = {4, 4, 4, 4}; + float real[] = {3, 4, 4, 4}; + float ideal[] = {4, 4, 4, 4}; int test_pass_fail = 1; - double sse_result = sse(real, ideal, 4); + float sse_result = sse(real, ideal, 4); test_pass_fail = test_pass_fail && (sse_result == 1); From 207ada9044692b122e0461bb8c1707adc30937cc Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sat, 22 Nov 2025 16:30:29 +0100 Subject: [PATCH 23/32] Introduce macro `TEST_FLOAT_EQ` & adapt unit tests --- include/test.h | 4 ++++ tests/test_accumulate_layer_gradients.c | 2 +- tests/test_get_delta.c | 8 ++++++-- tests/test_get_y.c | 12 ++++++------ tests/test_get_z.c | 2 +- tests/test_sse.c | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/test.h b/include/test.h index 5cf9797..880f862 100644 --- a/include/test.h +++ b/include/test.h @@ -11,4 +11,8 @@ if(test_result == 0) \ printf("PASS\n"); \ return 0; \ +#define TEST_FLOAT_EQ(v1, v2, eps) (fabsf((v1) - (v2)) < (eps)) + +#define EPS_TEST 1e-5 + #endif \ No newline at end of file diff --git a/tests/test_accumulate_layer_gradients.c b/tests/test_accumulate_layer_gradients.c index 77cfc4e..87b7a92 100644 --- a/tests/test_accumulate_layer_gradients.c +++ 
b/tests/test_accumulate_layer_gradients.c @@ -29,7 +29,7 @@ int test_accumulate_layer_gradients() for(int i=0; i<4; i++) for(int j=0; j<5; j++) - test_pass_fail &= batch_dw_ideal[i][j] == first_layer->n[i].batch_dw[j]; + test_pass_fail &= TEST_FLOAT_EQ(batch_dw_ideal[i][j], first_layer->n[i].batch_dw[j], EPS_TEST); return test_pass_fail; } diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c index 068f347..97e63c1 100644 --- a/tests/test_get_delta.c +++ b/tests/test_get_delta.c @@ -32,7 +32,9 @@ int test_get_delta() for(int i=0; i<3; i++) { - test_pass_fail &= (d_last_layer[i] == (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i])); + test_pass_fail &= TEST_FLOAT_EQ(d_last_layer[i], + (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]), + EPS_TEST); } // test before-last layer delta @@ -41,7 +43,9 @@ int test_get_delta() for(int i=0; i<3; i++) { - test_pass_fail &= (d_first_layer[i] == (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i])); + test_pass_fail &= TEST_FLOAT_EQ(d_first_layer[i], + (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]), + EPS_TEST); } return test_pass_fail; diff --git a/tests/test_get_y.c b/tests/test_get_y.c index de7cb02..30206bd 100644 --- a/tests/test_get_y.c +++ b/tests/test_get_y.c @@ -30,17 +30,17 @@ int test_get_y() // printf("y[1] == %.2f\n", y[1]); // printf("y[2] == %.2f\n", y[2]); - int test_pass_fail = (y[0] == 1) - && (y[1] == get_activation(z[0])) - && (y[2] == get_activation(z[1])); + int test_pass_fail = TEST_FLOAT_EQ(y[0], 1, EPS_TEST) + && TEST_FLOAT_EQ(y[1], get_activation(z[0]), EPS_TEST) + && TEST_FLOAT_EQ(y[2], get_activation(z[1]), EPS_TEST); y = get_y(n, 1, samples); z = get_z(n, 1, samples); test_pass_fail = test_pass_fail - && (y[0] == get_activation(z[0])) - && (y[1] == get_activation(z[1])) - && (y[2] == 
get_activation(z[2])); + && TEST_FLOAT_EQ(y[0], get_activation(z[0]), EPS_TEST) + && TEST_FLOAT_EQ(y[1], get_activation(z[1]), EPS_TEST) + && TEST_FLOAT_EQ(y[2], get_activation(z[2]), EPS_TEST); return test_pass_fail; } diff --git a/tests/test_get_z.c b/tests/test_get_z.c index 7244651..4367604 100644 --- a/tests/test_get_z.c +++ b/tests/test_get_z.c @@ -29,7 +29,7 @@ int test_get_z() // printf("z[1] == %.2f\n", z[1]); // printf("z[2] == %.2f\n", z[2]); - int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1); + int test_pass_fail = TEST_FLOAT_EQ(z[0], 2, EPS_TEST) && TEST_FLOAT_EQ(z[1], 6, EPS_TEST) && TEST_FLOAT_EQ(z[2], -1, EPS_TEST); return test_pass_fail; } diff --git a/tests/test_sse.c b/tests/test_sse.c index 9cacee5..732258a 100644 --- a/tests/test_sse.c +++ b/tests/test_sse.c @@ -9,20 +9,20 @@ int test_sse() float sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 1); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 1, EPS_TEST); real[0] = 4; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 0); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 0, EPS_TEST); real[0] = 6; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 4); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 4, EPS_TEST); real[0] = 6; real[1] = 2; sse_result = sse(real, ideal, 4); - test_pass_fail = test_pass_fail && (sse_result == 8); + test_pass_fail &= TEST_FLOAT_EQ(sse_result, 8, EPS_TEST); return test_pass_fail; } From a7277ba384881ab9c10df260fd24bd711b3a2f59 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 11:53:53 +0100 Subject: [PATCH 24/32] Update `TILE_SIZE` and `NUM_DPU` --- Makefile | 2 +- include/upmem.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5c9f6da..b9e2626 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang -DPU_UPMEM_CFLAGS += -DNR_TASKLETS=4 
+DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16 CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 FILES_TO_DELETE = build/ diff --git a/include/upmem.h b/include/upmem.h index 726ee31..c1fd7ee 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -8,11 +8,11 @@ #endif #ifndef NUM_DPU -#define NUM_DPU 8 +#define NUM_DPU 32 #endif #ifndef TILE_SIZE -#define TILE_SIZE 32 +#define TILE_SIZE 512 #endif typedef struct { From 6ddb943c2c934fb1a1d146171405cb01f5610fda Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 11:54:22 +0100 Subject: [PATCH 25/32] Fix typo in CMake configuration file --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26c5fc9..7f33b08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ add_custom_target(build_dpu_program ALL ) add_compile_definitions( - # NUM_CPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between + # NUM_DPU=1 Important: This macro override was commented because it does not apply to the dpu-upmem-dpurte-clang execution above; and therefore causes mismatch between # dpu_program.c and the rest. So this file should avoid modifying dimensions set through macros in aforementioned header files. 
DPU_BINARY_PATH=\"./dpu_program\" ) From e2cb79322c959fb1226c16a6140ea2cfafade619 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 12:01:17 +0100 Subject: [PATCH 26/32] Implement various improvements in top Makefile --- Makefile | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index b9e2626..5ee454a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,14 @@ DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16 -CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG -DBATCH_SIZE=2 -DMAX_EPOCH=2 -DNUM_TRAIN_SAMPLES=8 -FILES_TO_DELETE = build/ + +BATCH_SIZE ?= 20 +MAX_EPOCH ?= 10 +NUM_TRAIN_SAMPLES ?= 200 + +CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG +CFLAGS += -DBATCH_SIZE=$(BATCH_SIZE) -DMAX_EPOCH=$(MAX_EPOCH) -DNUM_TRAIN_SAMPLES=$(NUM_TRAIN_SAMPLES) + +BUILD_DIR = build/ UPMEM ?= 1 ifeq ($(UPMEM), 1) @@ -13,10 +20,10 @@ ifeq ($(SAN), 1) CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g endif -all: - mkdir build; \ +all: clean + mkdir $(BUILD_DIR); \ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu` clean: - rm -rf $(FILES_TO_DELETE) + rm -rf $(BUILD_DIR) From 4035e75fbc42a0a582479fe4570cec0120460ba2 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 12:10:34 +0100 Subject: [PATCH 27/32] Add performance evaluation mode `EVAL` --- Makefile | 5 +++++ include/upmem.h | 2 ++ src/host/dpu_host.c | 8 +++++++- src/host/mlp.c | 10 +++++++++- 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5ee454a..e2f8548 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,11 @@ ifeq ($(SAN), 1) CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g endif +EVAL ?= 0 +ifeq ($(EVAL), 1) + CFLAGS += -DEVAL +endif + all: clean mkdir $(BUILD_DIR); \ 
$(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \ diff --git a/include/upmem.h b/include/upmem.h index c1fd7ee..78bf601 100644 --- a/include/upmem.h +++ b/include/upmem.h @@ -15,6 +15,8 @@ #define TILE_SIZE 512 #endif +#define EVAL_DPU_CC 458000000 + typedef struct { uint32_t rows_a; uint32_t cols_a; diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index ac203a0..c09115f 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -1,5 +1,6 @@ #include #include +#include #include "upmem.h" struct dpu_set_t dpus, dpu; @@ -51,8 +52,13 @@ void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, } } } - + +#ifdef EVAL + unsigned long long start = __rdtsc(); + while(__rdtsc() - start < EVAL_DPU_CC); +#else process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE); +#endif for(int row=0; row unsigned int rseed = 42; @@ -60,6 +61,10 @@ int main() int num_batches = (NUM_TRAIN_SAMPLES + BATCH_SIZE - 1) / BATCH_SIZE; +#ifdef EVAL + unsigned long long cc_start = __rdtsc(); +#endif + while(1) { float learning_rate_epoch = LEARNING_RATE * powf(DECAY_RATE, epoch); @@ -127,7 +132,10 @@ int main() break; } - printf("Training complete in %d epochs\n", epoch); +#ifdef EVAL + unsigned long long cc_end = __rdtsc(); + printf("Training complete | %lld cycles | %d epochs\n", cc_end-cc_start, epoch); +#endif #ifdef DEBUG printf("\n===== Weights =====\n\n"); From a098509c497bc4f0ac3b6249c8a0ec8fc82d2949 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 13:35:41 +0100 Subject: [PATCH 28/32] Add printout at program start for debugging --- src/host/mlp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/host/mlp.c b/src/host/mlp.c index b492cd8..de56b3b 100644 --- a/src/host/mlp.c +++ b/src/host/mlp.c @@ -46,6 +46,12 @@ int main() free_uint8_matrix(sample_data, sample_rows); free_uint8_matrix(label_data, label_rows); +#ifdef UPMEM + printf("Run in UPMEM 
mode with BATCH_SIZE=%d, NUM_TRAIN_SAMPLES=%d, MAX_EPOCH=%d\n\n", BATCH_SIZE, NUM_TRAIN_SAMPLES, MAX_EPOCH); +#else + printf("Run in HOST mode with BATCH_SIZE=%d, NUM_TRAIN_SAMPLES=%d, MAX_EPOCH=%d\n\n", BATCH_SIZE, NUM_TRAIN_SAMPLES, MAX_EPOCH); +#endif + #ifdef DEBUG // print samples & labels to check if all is saved correctly into program memory printf("===== Samples =====\n\n"); From efd75f3f9825675a801bf01fbd5c33b1f61e8ad0 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 19:00:22 +0100 Subject: [PATCH 29/32] Update README --- README.md | 93 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 7deb945..ae97435 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,108 @@ # UPMEM-MLP -UPMEM-MLP is an attempt at implementing a multilayer perceptron application in pure C and accelerating this application on the UPMEM platform. +UPMEM-MLP implements a multilayer perceptron training application in C and accelerates this application on the UPMEM platform. 
-[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Valgrind](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml) +[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Memory Leak Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml) -## Requirements +## Prerequisites -- GCC or Clang - CMake 3.10 or higher +- GCC +- Python - UPMEM SDK -### Installing UPMEM SDK +
+<summary>Installing UPMEM SDK</summary>
-To set up the UPMEM SDK on your system: +1. Download UPMEM SDK tarball for your system from [this link](https://github.com/kagandikmen/upmem-sdk) -1. Download UPMEM SDK tarball for your system from [this link](https://sdk.upmem.com/) +> **NOTICE:** UPMEM SDK is no longer downloadable on UPMEM's official SDK [Downloads](https://sdk.upmem.com) page. 2. Extract its content and (preferably) move it to a better place like `/usr/local/bin/` -3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc` as in: +3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc`: ```bash -source /usr/local/bin/upmem-sdk/upmem_env.sh > /dev/null +source /usr/local/bin/upmem-sdk/upmem_env.sh simulator > /dev/null ``` 4. Restart your shell session for the changes to become effective -5. Test your setup using: +5. Test your setup: ```bash which dpu-lldb ``` +--- +
-which should, if correctly installed, return the path to the LLDB Debugger binary of UPMEM SDK +## Getting Started -## Running the Unit Tests - -To run the CMake test flow: +1. Clone this repository and navigate inside it: ```bash -mkdir build -cd build -cmake .. -make -make test +git clone https://github.com/OpenHardware-Initiative/UPMEM-MLP.git +cd UPMEM-MLP ``` -## Compiling the Multilayer Perceptron Natively - -To natively run the C multilayer perceptron on your system: - -1. Create a Python virtual environment (optional, but recommended) and install requirements: +2. **(Optional, but recommended)** Create a Python virtual environment: ```bash python3 -m venv venv source venv/bin/activate +``` + +3. Install Python requirements: + +```bash pip install -r requirements.txt ``` -2. Extract training samples & labels: +4. Extract training samples & labels: ```bash python3 read_dataset.py ``` -3. Compile the application: +5. Compile the MLP: ```bash make ``` +6. Run the MLP: + +```bash +./build/mlp +``` + With this command, you can use: -- `-DVERBOSE` for the verbose mode, which prints loss deltas for all epochs -- `-DDEBUG` for the debug mode, which prints a couple samples & labels at the beginning and all weights at the end -- `-DBATCH_SIZE=...` to configure the batch size used during training -- `-DMAX_EPOCH=...` to configure the maximum number of epochs the training can run for -- `-DEPSILON=...` to configure epsilon from the command line -- `-DLEARNING_RATE=...` to configure learning rate from the command line -- `-DDECAY_RATE=...` to configure the decay rate of the learning rate -- `-DMOMENTUM=...` to configure momentum from the command line -- `-DNUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with -- `-DTRAINING_SAMPLES_FILE=...` to configure the path to the text file samples should be sourced from -- `-DTRAINING_LABELS_FILE=...` to configure the path to the text file labels should be sourced from +- 
`BATCH_SIZE=...` to configure the batch size used during training, which otherwise defaults to 20 +- `MAX_EPOCH=...` to configure the maximum number of epochs the training can run for, which otherwise defaults to 10 +- `NUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with, which otherwise defaults to 200 +- `UPMEM=0` to turn off matrix multiplication on UPMEM +- `SAN=1` to run the MLP with GCC sanitizer +- `EVAL=1` to run the MLP in evaluation mode, which adds to the printout how many cycles are spent in training -## Status +## Running the Unit Tests + +UPMEM-MLP comes with unit tests, which can be found in `tests/`. Run these unit tests using: -UPMEM-MLP is a work in progress as of 2025-11-21. +```bash +mkdir build +cd build +cmake .. +make +make test +``` -### To-Do +## Status -- [ ] Evaluate and document acceleration achieved by matrix multiplication on UPMEM DIMM +UPMEM-MLP is completed and being actively maintained as of 2025-11-23. ## License UPMEM-MLP is licensed under the Apache License v2.0. See [LICENSE](LICENSE) for more details. 
---- \ No newline at end of file +--- From 6b5c2649188f3935e2cbb3bc35254af4c4f8bff4 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:00:44 +0100 Subject: [PATCH 30/32] Add some logging to `src/dpu/dpu_program.c` --- src/dpu/dpu_program.c | 11 +++++++++++ src/host/dpu_host.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c index 1155bac..cea4413 100644 --- a/src/dpu/dpu_program.c +++ b/src/dpu/dpu_program.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include "upmem.h" __mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE]; @@ -10,6 +12,8 @@ __host dpu_args_t DPU_INPUT_ARGS; int main() { + perfcounter_config(COUNT_CYCLES, false); + dpu_args_t dpu_input_args = DPU_INPUT_ARGS; uint32_t rows_a = dpu_input_args.rows_a; uint32_t cols_a = dpu_input_args.cols_a; @@ -18,6 +22,8 @@ int main() if(!rows_a) return 0; + perfcounter_t cc_start = perfcounter_get(); + int chunk = rows_a / NR_TASKLETS; int row_start = chunk * me(); @@ -30,6 +36,11 @@ int main() C_chunk[i * cols_b + j] = sum; } } + + perfcounter_t cc_end = perfcounter_get(); + + if(me() == 0) + printf("DPU completed in %ld cycles\n", cc_end-cc_start); return 0; } \ No newline at end of file diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c index c09115f..17bc719 100644 --- a/src/host/dpu_host.c +++ b/src/host/dpu_host.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "upmem.h" @@ -115,6 +116,10 @@ void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, in DPU_ASSERT(dpu_launch(dpus, DPU_SYNCHRONOUS)); + DPU_FOREACH(dpus, dpu) { + DPU_ASSERT(dpu_log_read(dpu, stdout)); + } + dpu_idx = 0; DPU_FOREACH(dpus, dpu) { From 5f3fb0ab959e89de53c6c6193db05cefd08f883f Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:01:41 +0100 Subject: [PATCH 31/32] Register benchmarking results in `benchmarks.md` --- benchmarks.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 
benchmarks.md diff --git a/benchmarks.md b/benchmarks.md new file mode 100644 index 0000000..ce5ca11 --- /dev/null +++ b/benchmarks.md @@ -0,0 +1,10 @@ +# Benchmark Results + +## NN Layout: NUM_FEATURES -> 4096 -> 4096 -> 2048 -> NUM_LABELS + +| BATCH_SIZE | NUM_TRAIN_SAMPLES | MAX_EPOCH | Cycles (Intel 64 Host) | Cycles (Intel 64 Host + UPMEM) | +|------------|-------------------|-----------|------------------------|--------------------------------| +| 1200 | 3600 | 1 | 13.05T | 12.73T | +| 3600 | 10800 | 1 | 42.38T | 39.49T | + +--- From c9d9af1b21c03f79c5b868d2bbcc66ce0328f151 Mon Sep 17 00:00:00 2001 From: Kagan Dikmen Date: Sun, 23 Nov 2025 22:03:02 +0100 Subject: [PATCH 32/32] Update CI for recent changes in upmem-sdk repo --- .github/build_upmem_toolchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh index cf7b9ab..cd157ec 100644 --- a/.github/build_upmem_toolchain.sh +++ b/.github/build_upmem_toolchain.sh @@ -2,6 +2,6 @@ cd /opt/ git clone https://github.com/kagandikmen/upmem-sdk.git -tar -xvf upmem-sdk/upmem-2024.2.0-Linux-x86_64.tar.gz +tar -xvf upmem-sdk/2024.2.0/upmem-2024.2.0-Linux-x86_64.tar.gz mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/ rm -rf upmem-sdk/ \ No newline at end of file