diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh
new file mode 100644
index 0000000..cd157ec
--- /dev/null
+++ b/.github/build_upmem_toolchain.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /opt/
+git clone https://github.com/kagandikmen/upmem-sdk.git
+tar -xvf upmem-sdk/2024.2.0/upmem-2024.2.0-Linux-x86_64.tar.gz
+mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/
+rm -rf upmem-sdk/
\ No newline at end of file
diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/memory_leak_tests.yaml
similarity index 50%
rename from .github/workflows/valgrind.yaml
rename to .github/workflows/memory_leak_tests.yaml
index 69cfa97..e6a8ba3 100644
--- a/.github/workflows/valgrind.yaml
+++ b/.github/workflows/memory_leak_tests.yaml
@@ -1,4 +1,4 @@
-name: Valgrind
+name: Memory Leak Tests
 
 on:
   push:
@@ -6,7 +6,7 @@ on:
 
 jobs:
   memcheck:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Checkout repository
@@ -19,25 +19,40 @@ jobs:
           sudo apt update
          sudo apt install -y build-essential valgrind
           pip3 install numpy
+          sudo bash .github/build_upmem_toolchain.sh
 
       - name: Extract training samples & labels
         run: python3 read_dataset.py
 
-      - name: Compile MLP
-        run: gcc -g -DEPSILON=0.5 -DNUM_TRAIN_SAMPLES=2 -Iinclude src/*.c -o mlp -lm
+      - name: Compile MLP without sanitizer or UPMEM
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          make SAN=0 UPMEM=0
 
       - name: Run Valgrind
         run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
           valgrind --leak-check=full \
                    --show-leak-kinds=all \
                    --track-origins=yes \
                    --error-exitcode=1 \
                    --log-file=valgrind.txt \
-                   ./mlp > /dev/null
+                   ./build/mlp > /dev/null
 
       - name: Save Valgrind log
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: valgrind_log
-          path: valgrind.txt
\ No newline at end of file
+          path: valgrind.txt
+
+      - name: Compile MLP with sanitizer and UPMEM
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          make clean
+          make SAN=1 UPMEM=1
+
+      - name: Run with sanitizer
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          ./build/mlp > /dev/null
\ No newline at end of file
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index cba3dcc..897d053 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build-and-test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Checkout repository
@@ -15,19 +15,27 @@
         with:
           submodules: 'recursive'
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt install -y build-essential
+        run: |
+          sudo apt update && sudo apt install -y build-essential python3.10 python3.10-dev
+          sudo bash .github/build_upmem_toolchain.sh
 
       - name: Create build directory
         run: mkdir build
 
       - name: Run CMake
         working-directory: build
-        run: cmake ..
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          cmake ..
 
       - name: Build
         working-directory: build
-        run: make
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          make
 
       - name: Run the tests
         working-directory: build
-        run: make test
\ No newline at end of file
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          make test
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 24a870c..274d4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 matmul.c
 matrices.h
-dpu/
 *.o
 *.out
 training_images.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d408d10..7f33b08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,18 +6,44 @@
 set(CMAKE_C_STANDARD_REQUIRED ON)
 
 include_directories(include)
 
-file(GLOB SRC_FILES src/*.c)
-list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/mlp.c")
+file(GLOB SRC_FILES src/host/*.c)
+list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/host/main.c")
 
 file(GLOB TEST_FILES tests/*.c)
 
+execute_process(
+    COMMAND dpu-pkg-config --cflags dpu
+    OUTPUT_VARIABLE DPU_C_FLAGS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+    COMMAND dpu-pkg-config --libs dpu
+    OUTPUT_VARIABLE DPU_LIBS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
 enable_testing()
 
+add_custom_target(build_dpu_program ALL
+    COMMAND dpu-upmem-dpurte-clang
+            -I${CMAKE_SOURCE_DIR}/include
+            -o ${CMAKE_BINARY_DIR}/dpu_program
+            ${CMAKE_SOURCE_DIR}/src/dpu/dpu_program.c
+)
+
+add_compile_definitions(
+    # NUM_DPU=1 is intentionally left commented out: a macro override here would not
+    # reach the dpu-upmem-dpurte-clang invocation above, so the host build would disagree
+    # with dpu_program.c. Avoid overriding dimension macros set in the header files here.
+    DPU_BINARY_PATH=\"./dpu_program\"
+)
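+
+# Hypothetical example, not part of the current build: if NUM_DPU ever has to be
+# overridden, pass the same value to both compilers so host and DPU code agree, e.g.
+#     add_compile_definitions(NUM_DPU=8)
+#     ... COMMAND dpu-upmem-dpurte-clang -DNUM_DPU=8 ...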
 
 foreach(TEST_SRC ${TEST_FILES})
     get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
     add_executable(${TEST_NAME} ${TEST_SRC} ${SRC_FILES})
     target_include_directories(${TEST_NAME} PRIVATE include)
-    target_link_libraries(${TEST_NAME} m)
+    target_compile_options(${TEST_NAME} PRIVATE ${DPU_C_FLAGS})
+    target_link_libraries(${TEST_NAME} PRIVATE m ${DPU_LIBS})
     add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
 endforeach()
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 9c5cb69..e2f8548 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,34 @@
-CLANG = dpu-upmem-dpurte-clang
-SOURCE = matmul
-CFLAGS += -O0 -DNR_TASKLETS=6
-FILESTODELETE = matmul.c dpu/
-
-all:
-	python3 generate.py && \
-	for test in $$(seq 0 15); do \
-		$(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
-	done
-	gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang
+DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16
 
-clean:
-	rm -rf *.o ${FILESTODELETE}
+BATCH_SIZE ?= 20
+MAX_EPOCH ?= 10
+NUM_TRAIN_SAMPLES ?= 200
+
+CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG
+CFLAGS += -DBATCH_SIZE=$(BATCH_SIZE) -DMAX_EPOCH=$(MAX_EPOCH) -DNUM_TRAIN_SAMPLES=$(NUM_TRAIN_SAMPLES)
+
+BUILD_DIR = build/
+
+UPMEM ?= 1
+ifeq ($(UPMEM), 1)
+    CFLAGS += -DUPMEM
+endif
 
-clean_all:
-	rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
+SAN ?= 0
+ifeq ($(SAN), 1)
+    CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g
+endif
+
+EVAL ?= 0
+ifeq ($(EVAL), 1)
+    CFLAGS += -DEVAL
+endif
+
+all: clean
+	mkdir $(BUILD_DIR); \
+	$(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \
+	gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+	rm -rf $(BUILD_DIR)
diff --git a/README.md b/README.md
index a4acb69..ae97435 100644
--- a/README.md
+++ b/README.md
@@ -1,99 +1,108 @@
 # UPMEM-MLP
 
-UPMEM-MLP is an attempt at implementing a multilayer perceptron application in pure C and accelerating this application on the UPMEM platform.
+UPMEM-MLP implements a multilayer perceptron training application in C and accelerates it on the UPMEM platform.
 
-[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Valgrind](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml)
+[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Memory Leak Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml)
 
-## Requirements
+## Prerequisites
 
-- GCC or Clang
 - CMake 3.10 or higher
+- GCC
+- Python
 - UPMEM SDK
 
-### Installing UPMEM SDK
+<details>
+<summary>Installing UPMEM SDK</summary>
 
-To set up the UPMEM SDK on your system:
+1. Download the UPMEM SDK tarball for your system from [this link](https://github.com/kagandikmen/upmem-sdk)
 
-1. Download UPMEM SDK tarball for your system from [this link](https://sdk.upmem.com/)
+> **NOTICE:** The UPMEM SDK is no longer downloadable from UPMEM's official SDK [Downloads](https://sdk.upmem.com) page.
 
 2. Extract its content and (preferably) move it to a better place like `/usr/local/bin/`
 
-3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc` as in:
+3. Add the shell script `upmem_env.sh`, which sets the necessary environment variables, to be sourced from your `.bashrc`:
 
 ```bash
-source /usr/local/bin/upmem-sdk/upmem_env.sh > /dev/null
+source /usr/local/bin/upmem-sdk/upmem_env.sh simulator > /dev/null
 ```
 
 4. Restart your shell session for the changes to become effective
 
-5. Test your setup using:
+5. Test your setup:
 
 ```bash
 which dpu-lldb
 ```
+
+---
+
+</details>
 
-which should, if correctly installed, return the path to the LLDB Debugger binary of UPMEM SDK
+## Getting Started
 
-## Running the Unit Tests
-
-To run the CMake test flow:
+1. Clone this repository and navigate inside it:
 
 ```bash
-mkdir build
-cd build
-cmake ..
-make
-make test
+git clone https://github.com/OpenHardware-Initiative/UPMEM-MLP.git
+cd UPMEM-MLP
 ```
 
-## Compiling the Multilayer Perceptron Natively
-
-To natively run the C multilayer perceptron on your system:
-
-1. Create a Python virtual environment (optional, but recommended) and install requirements:
+2. **(Optional, but recommended)** Create a Python virtual environment:
 
 ```bash
 python3 -m venv venv
 source venv/bin/activate
+```
+
+3. Install Python requirements:
+
+```bash
 pip install -r requirements.txt
 ```
 
-2. Extract training samples & labels:
+4. Extract training samples & labels:
 
 ```bash
 python3 read_dataset.py
 ```
 
-3. Compile the application:
+5. Compile the MLP:
 
 ```bash
-gcc -Iinclude src/*.c -o mlp -lm
+make
+```
+
+6. Run the MLP:
+
+```bash
+./build/mlp
 ```
 
-With this command, you can use:
+With the `make` command in step 5, you can use:
 
-- `-DVERBOSE` for the verbose mode, which prints loss deltas for all epochs
-- `-DDEBUG` for the debug mode, which prints a couple samples & labels at the beginning and all weights at the end
-- `-DBATCH_SIZE=...` to configure the batch size used during training
-- `-DMAX_EPOCH=...` to configure the maximum number of epochs the training can run for
-- `-DEPSILON=...` to configure epsilon from the command line
-- `-DLEARNING_RATE=...` to configure learning rate from the command line
-- `-DDECAY_RATE=...` to configure the decay rate of the learning rate
-- `-DMOMENTUM=...` to configure momentum from the command line
-- `-DNUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with
-- `-DTRAINING_SAMPLES_FILE=...` to configure the path to the text file samples should be sourced from
-- `-DTRAINING_LABELS_FILE=...` to configure the path to the text file labels should be sourced from
+- `BATCH_SIZE=...` to configure the batch size used during training, which otherwise defaults to 20
+- `MAX_EPOCH=...` to configure the maximum number of epochs the training can run for, which otherwise defaults to 10
+- `NUM_TRAIN_SAMPLES=...` to configure how many samples the model is trained with, which otherwise defaults to 200
+- `UPMEM=0` to turn off matrix multiplication on UPMEM
+- `SAN=1` to build the MLP with the GCC sanitizers enabled
+- `EVAL=1` to run the MLP in evaluation mode, which adds the number of cycles spent in training to the printout
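+
+For example, a sanitizer-instrumented, host-only build that trains on 1000 samples would be produced with (an illustrative invocation, using only the knobs listed above):
+
+```bash
+make SAN=1 UPMEM=0 NUM_TRAIN_SAMPLES=1000
+```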
 
-## Status
+## Running the Unit Tests
+
+UPMEM-MLP comes with unit tests, which can be found in `tests/`. Run these unit tests using:
 
-UPMEM-MLP is a work in progress as of 2025-11-14.
+```bash
+mkdir build
+cd build
+cmake ..
+make
+make test
+```
 
-### To-Do
+## Status
 
-- [ ] Adapt `multiply_matrix` for in-memory matrix multiplication on UPMEM
+UPMEM-MLP is complete and actively maintained as of 2025-11-23.
 
 ## License
 
 UPMEM-MLP is licensed under the Apache License v2.0. See [LICENSE](LICENSE) for more details.
 
----
\ No newline at end of file
+---
diff --git a/benchmarks.md b/benchmarks.md
new file mode 100644
index 0000000..ce5ca11
--- /dev/null
+++ b/benchmarks.md
@@ -0,0 +1,10 @@
+# Benchmark Results
+
+## NN Layout: NUM_FEATURES -> 4096 -> 4096 -> 2048 -> NUM_LABELS
+
+| BATCH_SIZE | NUM_TRAIN_SAMPLES | MAX_EPOCH | Cycles (Intel 64 Host) | Cycles (Intel 64 Host + UPMEM) |
+|------------|-------------------|-----------|------------------------|--------------------------------|
+| 1200       | 3600              | 1         | 13.05T                 | 12.73T                         |
+| 3600       | 10800             | 1         | 42.38T                 | 39.49T                         |
+
+---
diff --git a/include/mlp.h b/include/mlp.h
index b2a6616..081cea9 100644
--- a/include/mlp.h
+++ b/include/mlp.h
@@ -35,13 +35,13 @@ extern unsigned int rseed;
 
 typedef struct {
     int num_weights;
-    double *w, *lw;
-    double *batch_dw;
+    float *w, *lw;
+    float *batch_dw;
 } NEURON;
 
 typedef struct {
     int num_neurons;
-    double *inputs, *deltas;
+    float *inputs, *deltas;
     NEURON *n;
 } LAYER;
 
@@ -50,22 +50,23 @@ typedef struct {
     LAYER *l;
 } NETWORK;
 
-void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate);
+void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate);
 void apply_gradients(NETWORK *n, int batch_size);
-double drand();
-double get_activation(double x);
-double get_activation_derivative(double x);
-double *get_delta(NETWORK *n, double *samples, double *ideal, int layer_index);
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples);
-double *get_y(NETWORK *n, int layer_index, double *sample);
-double *get_z(NETWORK *n, int layer_index, double *sample);
+float drand();
+float get_activation(float x);
+float get_activation_derivative(float x);
+float *get_delta(NETWORK *n, float *samples, float *ideal, int layer_index);
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples);
+float *get_y(NETWORK *n, int layer_index, float *sample);
+float *get_z(NETWORK *n, int layer_index, float *sample);
 LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size);
 NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size);
 NEURON *init_neuron(int num_weights);
-void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
 uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols);
-double sse(double *real, double *ideal, int length);
-void transpose_matrix(const double *A, double *C, int rows, int cols);
+float sse(float *real, float *ideal, int length);
+void transpose_matrix(const float *A, float *C, int rows, int cols);
 
 //
 // utility functions
@@ -75,10 +76,10 @@
 void free_layer(LAYER *l);
 void free_network(NETWORK *n);
 void free_neuron(NEURON *n);
-void free_double_matrix(double **addr, int nrows);
+void free_float_matrix(float **addr, int nrows);
 void free_uint8_matrix(uint8_t **addr, int nrows);
-void print_double_matrix(double **addr, int nrows, int ncols);
-void print_double_vector(double *addr, int nrows);
+void print_float_matrix(float **addr, int nrows, int ncols);
+void print_float_vector(float *addr, int nrows);
 
 #endif
diff --git a/include/test.h b/include/test.h
index 5cf9797..880f862 100644
--- a/include/test.h
+++ b/include/test.h
@@ -11,4 +11,8 @@
         if(test_result == 0) \
             printf("PASS\n"); \
     return 0; \
 
+#define TEST_FLOAT_EQ(v1, v2, eps) (fabsf((v1) - (v2)) < (eps))
+
+#define EPS_TEST 1e-5
+
 #endif
\ No newline at end of file
diff --git a/include/upmem.h b/include/upmem.h
new file mode 100644
index 0000000..78bf601
--- /dev/null
+++ b/include/upmem.h
@@ -0,0 +1,33 @@
+#ifndef UPMEM_H
+#define UPMEM_H
+
+#include <stdint.h>
+
+#ifndef DPU_BINARY_PATH
+#define DPU_BINARY_PATH "build/dpu_program"
+#endif
+
+#ifndef NUM_DPU
+#define NUM_DPU 32
+#endif
+
+#ifndef TILE_SIZE
+#define TILE_SIZE 512
+#endif
+
+#define EVAL_DPU_CC 458000000
+
+typedef struct {
+    uint32_t rows_a;
+    uint32_t cols_a;
+    uint32_t cols_b;
+} dpu_args_t;
+
+extern int upmem_initialized;
+
+void free_dpus();
+void init_dpus();
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
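+
+// Typical host-side call sequence (sketch, mirroring tests/test_matrix.c):
+//     init_dpus();                                    // allocate NUM_DPU DPUs, load DPU_BINARY_PATH
+//     multiply_matrix_upmem(A, B, C, rows, k, cols);  // C = A * B, tiled across the DPU set
+//     free_dpus();                                    // release the DPU set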
+
+#endif
diff --git a/legacy/Makefile b/legacy/Makefile
new file mode 100644
index 0000000..9c5cb69
--- /dev/null
+++ b/legacy/Makefile
@@ -0,0 +1,17 @@
+CLANG = dpu-upmem-dpurte-clang
+SOURCE = matmul
+CFLAGS += -O0 -DNR_TASKLETS=6
+FILESTODELETE = matmul.c dpu/
+
+all:
+	python3 generate.py && \
+	for test in $$(seq 0 15); do \
+		$(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
+	done
+	gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+	rm -rf *.o ${FILESTODELETE}
+
+clean_all:
+	rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
diff --git a/generate.py b/legacy/generate.py
similarity index 100%
rename from generate.py
rename to legacy/generate.py
diff --git a/host.c b/legacy/host.c
similarity index 100%
rename from host.c
rename to legacy/host.c
diff --git a/matmul.template b/legacy/matmul.template
similarity index 100%
rename from matmul.template
rename to legacy/matmul.template
diff --git a/matrices.template b/legacy/matrices.template
similarity index 100%
rename from matrices.template
rename to legacy/matrices.template
diff --git a/src/activation.c b/src/activation.c
deleted file mode 100644
index eeaaee7..0000000
--- a/src/activation.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "mlp.h"
-
-double get_activation(double x)
-{
-    return tanh(x);
-}
-
-double get_activation_derivative(double x)
-{
-    return 1.0 / pow(cosh(x), 2);
-}
\ No newline at end of file
diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c
new file mode 100644
index 0000000..cea4413
--- /dev/null
+++ b/src/dpu/dpu_program.c
@@ -0,0 +1,46 @@
+#include <defs.h>
+#include <mram.h>
+#include <perfcounter.h>
+#include <stdio.h>
+#include "upmem.h"
+
+__mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE];
+__mram_noinit float B_whole[TILE_SIZE * TILE_SIZE];
+__mram_noinit float C_chunk[TILE_SIZE * TILE_SIZE];
+
+__host dpu_args_t DPU_INPUT_ARGS;
+
+int main()
+{
+    perfcounter_config(COUNT_CYCLES, false);
+
+    dpu_args_t dpu_input_args = DPU_INPUT_ARGS;
+    uint32_t rows_a = dpu_input_args.rows_a;
+    uint32_t cols_a = dpu_input_args.cols_a;
+    uint32_t cols_b = dpu_input_args.cols_b;
+
+    if(!rows_a)
+        return 0;
+
+    perfcounter_t cc_start = perfcounter_get();
+
+    int chunk = rows_a / NR_TASKLETS;
+    int row_start = chunk * me();
+
+    // each tasklet computes its own block of rows of C_chunk = A_chunk * B_whole
+    for(int i=row_start; i<(row_start+chunk); ++i) {
+        for(int j=0; j<cols_b; ++j) {
+            float sum = 0;
+            for(int k=0; k<cols_a; ++k) {
+                sum += A_chunk[i*cols_a + k] * B_whole[k*cols_b + j];
+            }
+            C_chunk[i*cols_b + j] = sum;
+        }
+    }
+
+    perfcounter_t cc_end = perfcounter_get();
+
+    if(me() == 0)
+        printf("DPU cycles: %lu\n", (unsigned long)(cc_end - cc_start));
+
+    return 0;
+}
diff --git a/src/accumulate_layer_gradients.c b/src/host/accumulate_layer_gradients.c
rename from src/accumulate_layer_gradients.c
rename to src/host/accumulate_layer_gradients.c
--- a/src/accumulate_layer_gradients.c
+++ b/src/host/accumulate_layer_gradients.c
@@ ... @@
-void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate)
+void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate)
 {
     int num_neurons = l->num_neurons;
     int num_weights = l->n->num_weights;
 
-    double *gradient = (double *) malloc (num_neurons * num_weights * sizeof(double));
+    float *gradient = (float *) malloc (num_neurons * num_weights * sizeof(float));
     if(!gradient) {
         return;
     }
 
-    double *deltas_T = (double*) malloc (num_neurons * batch_size * sizeof(double));
+    float *deltas_T = (float*) malloc (num_neurons * batch_size * sizeof(float));
     if(!deltas_T) {
         free(gradient);
         return;
diff --git a/src/host/activation.c b/src/host/activation.c
new file mode 100644
index 0000000..5345ec6
--- /dev/null
+++ b/src/host/activation.c
@@ -0,0 +1,11 @@
+#include "mlp.h"
+
+float get_activation(float x)
+{
+    return tanhf(x);
+}
+
+float get_activation_derivative(float x)
+{
+    return 1.0 / powf(coshf(x), 2);
+}
\ No newline at end of file
diff --git a/src/apply_gradients.c b/src/host/apply_gradients.c
similarity index 71%
rename from src/apply_gradients.c
rename to src/host/apply_gradients.c
index 4bc143b..ede95e7 100644
--- a/src/apply_gradients.c
+++ b/src/host/apply_gradients.c
@@ -15,11 +15,11 @@
 
         for(int k=0; k<np->num_weights; k++)    // do the following for all weights "k" of said neuron:
         {
-            double previous_weight_update = np->w[k] - np->lw[k];
-            double momentum_term = MOMENTUM * previous_weight_update;
-            double gradient_term = np->batch_dw[k] / (double) batch_size;
+            float previous_weight_update = np->w[k] - np->lw[k];
+            float momentum_term = MOMENTUM * previous_weight_update;
+            float gradient_term = np->batch_dw[k] / (float) batch_size;
 
-            double old_weight = np->w[k];
+            float old_weight = np->w[k];
             np->lw[k] = old_weight;
 
             np->w[k] = old_weight + gradient_term + momentum_term;
diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c
new file mode 100644
index 0000000..17bc719
--- /dev/null
+++ b/src/host/dpu_host.c
@@ -0,0 +1,149 @@
+#include <dpu.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "upmem.h"
+
+struct dpu_set_t dpus, dpu;
+int upmem_initialized = 0;
+
+void free_dpus()
+{
+    DPU_ASSERT(dpu_free(dpus));
+}
+
+void init_dpus()
+{
+    if(!upmem_initialized) {
+        DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus));
+        DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL));
+
+        upmem_initialized = 1;
+    }
+}
+
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    float tileA[TILE_SIZE][TILE_SIZE];
+    float tileB[TILE_SIZE][TILE_SIZE];
+    float tileC[TILE_SIZE][TILE_SIZE];
+
+    // walk the output in TILE_SIZE x TILE_SIZE blocks; edge tiles are zero-padded
+    for(int i=0; i<rows_a; i+=TILE_SIZE) {
+        int tile_rows = (rows_a - i > TILE_SIZE) ? TILE_SIZE : (rows_a - i);
+
+        for(int j=0; j<cols_b; j+=TILE_SIZE) {
+            int tile_cols = (cols_b - j > TILE_SIZE) ? TILE_SIZE : (cols_b - j);
+
+            for(int r=0; r<tile_rows; r++)
+                for(int c=0; c<tile_cols; c++)
+                    C[(i+r)*cols_b + (j+c)] = 0;
+
+            for(int k=0; k<cols_a; k+=TILE_SIZE) {
+                int tile_depth = (cols_a - k > TILE_SIZE) ? TILE_SIZE : (cols_a - k);
+
+                memset(tileA, 0, sizeof(tileA));
+                memset(tileB, 0, sizeof(tileB));
+
+                for(int r=0; r<tile_rows; r++)
+                    for(int c=0; c<tile_depth; c++)
+                        tileA[r][c] = A[(i+r)*cols_a + (k+c)];
+
+                for(int r=0; r<tile_depth; r++)
+                    for(int c=0; c<tile_cols; c++)
+                        tileB[r][c] = B[(k+r)*cols_b + (j+c)];
+
+                process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE);
+
+                // accumulate the partial tile product into C
+                for(int r=0; r<tile_rows; r++)
+                    for(int c=0; c<tile_cols; c++)
+                        C[(i+r)*cols_b + (j+c)] += tileC[r][c];
+            }
+        }
+    }
+}
+
+void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    uint32_t each_dpu;
+    uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU;
+
+    DPU_FOREACH(dpus, dpu, each_dpu) {
+        uint32_t row_start = each_dpu * dpu_rows_a_max;
+        uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0
+                                   : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+                                   : dpu_rows_a_max;
+
+        dpu_args_t args = {
+            .rows_a = dpu_rows_a_actual,
+            .cols_a = cols_a,
+            .cols_b = cols_b
+        };
+
+        DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args)));
+
+        if(dpu_rows_a_actual) {
+            uint32_t elems_a = dpu_rows_a_actual * cols_a;
+            uint32_t bytes_a = elems_a * sizeof(float);
+
+            float *A_chunk = (float*)malloc(bytes_a);
+
+            // pack this DPU's share of A rows into a contiguous buffer
+            for(int r=0; r<dpu_rows_a_actual; r++)
+                memcpy(A_chunk + r*cols_a, A + (row_start + r)*cols_a, cols_a * sizeof(float));
+
+            DPU_ASSERT(dpu_copy_to(dpu, "A_chunk", 0, A_chunk, bytes_a));
+            DPU_ASSERT(dpu_copy_to(dpu, "B_whole", 0, B, cols_a * cols_b * sizeof(float)));
+
+            free(A_chunk);
+        }
+    }
+
+    DPU_ASSERT(dpu_launch(dpus, DPU_SYNCHRONOUS));
+
+    DPU_FOREACH(dpus, dpu, each_dpu) {
+        uint32_t row_start = each_dpu * dpu_rows_a_max;
+        uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0
+                                   : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+                                   : dpu_rows_a_max;
+
+        if(dpu_rows_a_actual) {
+            uint32_t elems_c = dpu_rows_a_actual * cols_b;
+            uint32_t bytes_c = elems_c * sizeof(float);
+
+            float *C_chunk = (float*)malloc(bytes_c);
+
+            DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c));
+
+            // scatter the result rows back into C
+            for(int r=0; r<dpu_rows_a_actual; r++)
+                memcpy(C + (row_start + r)*cols_b, C_chunk + r*cols_b, cols_b * sizeof(float));
+
+            free(C_chunk);
+        }
+    }
+}
diff --git a/src/get_delta.c b/src/host/get_delta.c
rename from src/get_delta.c
rename to src/host/get_delta.c
--- a/src/get_delta.c
+++ b/src/host/get_delta.c
@@ ... @@
-double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index)
+float *get_delta(NETWORK *n, float* sample, float* ideal, int layer_index)
 {
     int layer_size = (n->l+layer_index)->num_neurons;
 
-    double *d = (double*) malloc (sizeof(double) * layer_size);
+    float *d = (float*) malloc (sizeof(float) * layer_size);
     if(!d) {
         fprintf(stderr, "Error 10010\n");
         return NULL;
     }
 
-    double *z = get_z(n, layer_index, sample);
+    float *z = get_z(n, layer_index, sample);
     if(!z) {
         fprintf(stderr, "Error 10011\n");
         free(d);
@@ -21,7 +21,7 @@
 
     if(is_current_layer_last_layer)
     {
-        double *y = get_y(n, layer_index, sample);
+        float *y = get_y(n, layer_index, sample);
         if(!y) {
             fprintf(stderr, "Error 10012\n");
             free(d);
@@ -36,7 +36,7 @@
     }
     else
     {
-        double *next_d = get_delta(n, sample, ideal, layer_index+1);
+        float *next_d = get_delta(n, sample, ideal, layer_index+1);
         if(!next_d) {
             fprintf(stderr, "Error 10013\n");
             free(d);
diff --git a/src/get_total_loss.c b/src/host/get_total_loss.c
similarity index 67%
rename from src/get_total_loss.c
rename to src/host/get_total_loss.c
index 8bf7f2c..c386536 100644
--- a/src/get_total_loss.c
+++ b/src/host/get_total_loss.c
@@ -1,8 +1,8 @@
 #include "mlp.h"
 
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples)
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples)
 {
-    double *total_loss = (double*) malloc (sizeof(double));
+    float *total_loss = (float*) malloc (sizeof(float));
     if(!total_loss) {
         fprintf(stderr, "Error 10007\n");
         return NULL;
@@ -13,13 +13,13 @@
 
     LAYER *last_layer = n->l+(n->num_layers-1);
 
     for(int i=0; i<nsamples; i++)
     {
-        double *y = get_y(n, n->num_layers-1, samples[i]);
+        float *y = get_y(n, n->num_layers-1, samples[i]);
         if(!y) {
             fprintf(stderr, "Error 10008\n");
             free(total_loss);
             return NULL;
         }
 
-        *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (double)nsamples;
+        *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (float)nsamples;
 
         free(y);
     }
diff --git a/src/get_y.c b/src/host/get_y.c
similarity index 78%
rename from src/get_y.c
rename to src/host/get_y.c
index 3e5b70e..5931c62 100644
--- a/src/get_y.c
+++ b/src/host/get_y.c
@@ -2,7 +2,7 @@
 
 // preactivation -> get_y -> activation
 
-double *get_y(NETWORK *n, int layer_index, double *sample)
+float *get_y(NETWORK *n, int layer_index, float *sample)
 {
     LAYER *current_layer = n->l+layer_index;
     int is_current_layer_last_layer = (n->num_layers == layer_index + 1);
@@ -11,9 +11,9 @@
     if(!is_current_layer_last_layer)    // add bias node
         y_size++;
 
-    double *z = get_z(n, layer_index, sample);
+    float *z = get_z(n, layer_index, sample);
 
-    double *y = (double *) malloc (sizeof(double)*y_size);
+    float *y = (float *) malloc (sizeof(float)*y_size);
     if(!y) {
         fprintf(stderr, "Error 10006\n");
         return NULL;
diff --git a/src/get_z.c b/src/host/get_z.c
similarity index 75%
rename from src/get_z.c
rename to src/host/get_z.c
index ad7a08d..466ee1a 100644
--- a/src/get_z.c
+++ b/src/host/get_z.c
@@ -2,20 +2,20 @@
 
 // samples -> get_z -> preactivation
 
-double *get_z(NETWORK *n, int layer_index, double *sample)
+float *get_z(NETWORK *n, int layer_index, float *sample)
 {
     LAYER *current_layer = n->l+layer_index;
     int z_neuroncount = current_layer->num_neurons;
     int z_weightcount = current_layer->n->num_weights;
     int is_first_layer = layer_index == 0;
 
-    double *z = (double *) malloc (sizeof(double)* z_neuroncount);
+    float *z = (float *) malloc (sizeof(float)* z_neuroncount);
     if(!z) {
         fprintf(stderr, "Error 10005\n");
         return NULL;
     }
 
-    double *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
+    float *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
 
     for(size_t i=0; i<z_neuroncount; i++)
     {
         z[i] = 0;
diff --git a/src/init_layer.c b/src/host/init_layer.c
rename from src/init_layer.c
rename to src/host/init_layer.c
--- a/src/init_layer.c
+++ b/src/host/init_layer.c
@@ ... @@
     l->num_neurons = num_neurons;
 
-    l->inputs = (double*) malloc (batch_size * num_weights_per_neuron * sizeof(double));
+    l->inputs = (float*) malloc (batch_size * num_weights_per_neuron * sizeof(float));
     if(!l->inputs) {
         free(l);
         return NULL;
     }
 
-    l->deltas = (double*) malloc (batch_size * num_neurons * sizeof(double));
+    l->deltas = (float*) malloc (batch_size * num_neurons * sizeof(float));
     if(!l->deltas) {
         free(l->inputs);
         free(l);
diff --git a/src/init_network.c b/src/host/init_network.c
similarity index 100%
rename from src/init_network.c
rename to src/host/init_network.c
diff --git a/src/init_neuron.c b/src/host/init_neuron.c
similarity index 65%
rename from src/init_neuron.c
rename to src/host/init_neuron.c
index b5506f3..450677e 100644
--- a/src/init_neuron.c
+++ b/src/host/init_neuron.c
@@ -9,20 +9,20 @@
 
     n->num_weights = num_weights;
 
-    n->w = (double *) malloc (sizeof(double) * n->num_weights);
+    n->w = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->w) {
         free(n);
         return NULL;
     }
 
-    n->lw = (double *) malloc (sizeof(double) * n->num_weights);
+    n->lw = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->lw) {
         free(n->w);
         free(n);
         return NULL;
     }
 
-    n->batch_dw = (double *) malloc (sizeof(double) * n->num_weights);
+    n->batch_dw = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->batch_dw) {
         free(n->lw);
         free(n->w);
@@ -30,11 +30,11 @@
         return NULL;
     }
 
-    double limit = 1.0/sqrt((double) num_weights);
+    float limit = 1.0f/sqrtf((float) num_weights);
 
     for(int i=0; i<num_weights; i++)
     {
-        double rand_unit = drand();
+        float rand_unit = drand();
         n->w[i] = (rand_unit * 2.0 - 1.0) * limit;
         n->lw[i] = n->w[i];
         n->batch_dw[i] = 0;
diff --git a/src/host/matrix.c b/src/host/matrix.c
new file mode 100644
index 0000000..967ba10
--- /dev/null
+++ b/src/host/matrix.c
@@ -0,0 +1,34 @@
+#include "mlp.h"
+#include "upmem.h"
+
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+#ifdef UPMEM
+    init_dpus();
+    multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b);
+#else
+    multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b);
+#endif
+}
+
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    for(int i=0; i<rows_a; i++) {
+        for(int j=0; j<cols_b; j++) {
+            C[i*cols_b + j] = 0;
+            for(int k=0; k<cols_a; k++) {
+                C[i*cols_b + j] += A[i*cols_a + k] * B[k*cols_b + j];
+            }
+        }
+    }
+}
+
+void transpose_matrix(const float *A, float *C, int rows, int cols)
+{
+    for(int i=0; i<rows; i++) {
+        for(int j=0; j<cols; j++) {
+            C[j*rows + i] = A[i*cols + j];
+        }
+    }
+}
diff --git a/src/mlp.c b/src/host/main.c
similarity index 58%
rename from src/mlp.c
rename to src/host/main.c
index fcbe0d3..b55b91a 100644
--- a/src/mlp.c
+++ b/src/host/main.c
@@ -1,6 +1,8 @@
 #include "mlp.h"
+#include <string.h>
+#include <x86intrin.h>
 
 unsigned int rseed = 42;
@@ -11,8 +13,8 @@
     int epoch = 0;
 
     int num_inputs = NUM_FEATURES;
-    int num_layers = 5;
-    int num_neurons_per_layer[] = {NUM_FEATURES, 1000, 1000, 100, NUM_LABELS};
+    int num_layers = 3;
+    int num_neurons_per_layer[] = {NUM_FEATURES, 10, NUM_LABELS};
 
     NETWORK *n = init_network(num_inputs, num_layers, num_neurons_per_layer, BATCH_SIZE);
     if(!n) {
@@ -20,16 +22,16 @@
         return 1;
     }
 
-    double **samples = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
-    double **labels = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
+    float **samples = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
+    float **labels = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
 
     uint8_t **sample_data = read_image_data(TRAINING_SAMPLES_FILE, &sample_rows, NUM_FEATURES);
     uint8_t **label_data = read_image_data(TRAINING_LABELS_FILE, &label_rows, 1);
 
     // save data into `samples` and `labels`
     for(size_t i=0; i<NUM_TRAIN_SAMPLES; ++i)
     {
-        samples[i] = (double*) malloc (sizeof(double)*NUM_FEATURES);
-        labels[i] = (double*) malloc (sizeof(double)*NUM_LABELS);
+        samples[i] = (float*) malloc (sizeof(float)*NUM_FEATURES);
+        labels[i] = (float*) malloc (sizeof(float)*NUM_LABELS);
@@ ... @@
-    double *loss_prev = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
+    float *loss_prev = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
 
+#ifdef EVAL
+    unsigned long long cc_start = __rdtsc();
+#endif
+
     while(1)
     {
         for(int batch_start=0; batch_start<NUM_TRAIN_SAMPLES; batch_start+=BATCH_SIZE)
         {
             int actual_batch_size = (NUM_TRAIN_SAMPLES - batch_start < BATCH_SIZE) ? (NUM_TRAIN_SAMPLES - batch_start) : BATCH_SIZE;
 
             for(int batch_ctr=0; batch_ctr<actual_batch_size; ++batch_ctr)
             {
                 int i = batch_start + batch_ctr;
 
                 for(int j=n->num_layers-1; j>=0; --j)
                 {
                     LAYER *lp = n->l+j;    // ptr to layer j of network n
 
-                    double *d = get_delta(n, samples[i], labels[i], j);
+                    float *d = get_delta(n, samples[i], labels[i], j);
 
-                    memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(double));
+                    memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(float));
 
-                    double *py = j ? get_y(n, j-1, samples[i]) : NULL;
+                    float *py = j ? get_y(n, j-1, samples[i]) : NULL;
                     if(j && !py) {
                         fprintf(stderr, "Error 10009\n");
                         return 1;
                     }
 
-                    memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(double));
+                    memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(float));
 
                     free(d);
                     if(j) free(py);
@@ -105,18 +117,18 @@
             apply_gradients(n, actual_batch_size);
         }
 
-        double *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
+        float *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
         if(!loss_new) {
             fprintf(stderr, "Error 10015\n");
             return 1;
         }
 
-        double loss_delta = fabs(*loss_new - *loss_prev);
+        float loss_delta = fabsf(*loss_new - *loss_prev);
 
         epoch++;
 
 #ifdef VERBOSE
-        printf("Epoch %-3d --- Lost Delta = %.9lf --- Final Loss = %.6lf\n", epoch, loss_delta, *loss_new);
+        printf("Epoch %-3d --- Loss Delta = %.9f --- Final Loss = %.6f\n", epoch, loss_delta, *loss_new);
 #endif
 
         free(loss_prev);
@@ -126,7 +138,10 @@
             break;
     }
 
-    printf("Training complete in %d epochs\n", epoch);
+#ifdef EVAL
+    unsigned long long cc_end = __rdtsc();
+    printf("Training complete | %llu cycles | %d epochs\n", cc_end-cc_start, epoch);
+#endif
 
 #ifdef DEBUG
     printf("\n===== Weights =====\n\n");
     for(int i=0; i<n->num_layers; i++) {
         LAYER *lp = n->l+i;    // ptr to i-th layer of the network n
         for(int j=0; j<lp->num_neurons; j++) {
             NEURON *np = lp->n+j;    // ptr to j-th neuron of the i-th layer of network n
-            print_double_vector(np->w, np->num_weights);
+            print_float_vector(np->w, np->num_weights);
             printf("\n");
         }
         printf("\n\n");
     }
 #endif
 
+    // free DPUs if UPMEM was deployed
+    if(upmem_initialized) {
+        free_dpus();
+    }
+
     // memory cleanup before termination
-    free_double_matrix(samples, NUM_TRAIN_SAMPLES);
-    free_double_matrix(labels, NUM_TRAIN_SAMPLES);
+    free_float_matrix(samples, NUM_TRAIN_SAMPLES);
+    free_float_matrix(labels, NUM_TRAIN_SAMPLES);
     free_network(n);
 
     return 0;
diff --git a/src/read_image_data.c b/src/host/read_image_data.c
similarity index 100%
rename from src/read_image_data.c
rename to src/host/read_image_data.c
diff --git a/src/host/sse.c b/src/host/sse.c
new file mode 100644
index 0000000..cf58db9
--- /dev/null
+++ b/src/host/sse.c
@@ -0,0 +1,13 @@
+#include "mlp.h"
+
+float sse(float *real, float *ideal, int length)
+{
+    float sse = 0.0;    // Sum of squared errors
+
+    for(size_t i=0; i<length; i++)
+    {
+        sse += powf(real[i] - ideal[i], 2);
+    }
+
+    return sse;
+}
diff --git a/src/utility.c b/src/host/utility.c
rename from src/utility.c
rename to src/host/utility.c
--- a/src/utility.c
+++ b/src/host/utility.c
@@ ... @@
     n->num_weights = 0;
 }
 
-void free_double_matrix(double **addr, int nrows)
+void free_float_matrix(float **addr, int nrows)
 {
     if(!addr)
         return;
@@ -68,19 +68,19 @@
     free(addr);
 }
 
-void print_double_matrix(double **addr, int nrows, int ncols)
+void print_float_matrix(float **addr, int nrows, int ncols)
 {
     for(size_t i=0; i<nrows; i++)
     {
         for(size_t j=0; j<ncols; j++)
-            printf("%lf ", addr[i][j]);
+            printf("%f ", addr[i][j]);
         printf("\n");
     }
 }
 
-void print_double_vector(double *addr, int nrows)
+void print_float_vector(float *addr, int nrows)
 {
     for(size_t i=0; i<nrows; i++)
-        printf("%lf ", addr[i]);
+        printf("%f ", addr[i]);
     printf("\n");
 }
diff --git a/tests/test_accumulate_layer_gradients.c b/tests/test_accumulate_layer_gradients.c
--- a/tests/test_accumulate_layer_gradients.c
+++ b/tests/test_accumulate_layer_gradients.c
@@ ... @@
-        first_layer->inputs[i] = ((double) rand() / (double) RAND_MAX) * 20;
+        first_layer->inputs[i] = ((float) rand() / (float) RAND_MAX) * 20;
 
     // deltas is a 1x4 identity matrix
     for(int i=0; i<1*4; i++)
         first_layer->deltas[i] = 1.0;
 
-    double batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    float batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
     for(int i=0; i<4; i++)
         for(int j=0; j<5; j++)
@@ -29,7 +29,7 @@
 
     for(int i=0; i<4; i++)
         for(int j=0; j<5; j++)
-            test_pass_fail &= batch_dw_ideal[i][j] == first_layer->n[i].batch_dw[j];
+            test_pass_fail &= TEST_FLOAT_EQ(batch_dw_ideal[i][j], first_layer->n[i].batch_dw[j], EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_activation.c b/tests/test_activation.c
index 8998218..2d120bb 100644
--- a/tests/test_activation.c
+++ b/tests/test_activation.c
@@ -1,12 +1,12 @@
 #include "mlp.h"
 #include "test.h"
 
-int test_activation(double x)
+int test_activation(float x)
 {
-    double activation_result = get_activation(x);
-    double activation_derivative_result = get_activation_derivative(x);
+    float activation_result = get_activation(x);
+    float activation_derivative_result = get_activation_derivative(x);
 
-    double expected_activation_derivative = 1 - pow(activation_result, 2);
+    float expected_activation_derivative = 1 - powf(activation_result, 2);
 
     if(abs(activation_derivative_result - expected_activation_derivative) < 1e-5)
         return 1;
diff --git a/tests/test_drand.c b/tests/test_drand.c
index 1411e08..2771bb1 100644
--- a/tests/test_drand.c
+++ b/tests/test_drand.c
@@ -7,7 +7,7 @@
 
     for(int i=0; i<10; i++)
     {
-        double test_value = drand();
+        float test_value = drand();
         test_pass_fail &= (test_value >= 0.0) && (test_value <= 1.0);
     }
 
diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c
index b8a06c6..97e63c1 100644
--- a/tests/test_get_delta.c
+++ b/tests/test_get_delta.c
@@ -6,8 +6,8 @@
     int test_pass_fail = 1;
 
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
-    double ideals[] = {3, 3, 3, 3};
+    float samples[] = {1, 1, 1, 1};
+    float ideals[] = {3, 3, 3, 3};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -28,20 +28,24 @@
 
     // test last layer delta
 
-    double *d_last_layer = get_delta(n, samples, ideals, 1);
+    float *d_last_layer = get_delta(n, samples, ideals, 1);
 
     for(int i=0; i<3; i++)
     {
-        test_pass_fail &= (d_last_layer[i] == (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]));
+        test_pass_fail &= TEST_FLOAT_EQ(d_last_layer[i],
+                                        (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]),
+                                        EPS_TEST);
     }
 
     // test before-last layer delta
 
-    double *d_first_layer = get_delta(n, samples, ideals, 0);
+    float *d_first_layer = get_delta(n, samples, ideals, 0);
 
     for(int i=0; i<3; i++)
    {
-        test_pass_fail &= (d_first_layer[i] == (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]));
+        test_pass_fail &= TEST_FLOAT_EQ(d_first_layer[i],
+                                        (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]),
+                                        EPS_TEST);
     }
 
     return test_pass_fail;
diff --git a/tests/test_get_y.c b/tests/test_get_y.c
index 4274682..30206bd 100644
--- a/tests/test_get_y.c
+++ b/tests/test_get_y.c
@@ -4,7 +4,7 @@
 {
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
+    float samples[] = {1, 1, 1, 1};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -23,24 +23,24 @@
     n->l[0].n[2].w[2] = 0.0;
     n->l[0].n[2].w[3] = 0.0;
 
-    double *y = get_y(n, 0, samples);
-    double *z = get_z(n, 0, samples);
+    float *y = get_y(n, 0, samples);
+    float *z = get_z(n, 0, samples);
 
-    // printf("y[0] == %.2lf\n", y[0]);
-    // printf("y[1] == %.2lf\n", y[1]);
-    // printf("y[2] == %.2lf\n", y[2]);
+    // printf("y[0] == %.2f\n", y[0]);
+    // printf("y[1] == %.2f\n", y[1]);
+    // printf("y[2] == %.2f\n", y[2]);
 
-    int test_pass_fail = (y[0] == 1)
-                         && (y[1] == get_activation(z[0]))
-                         && (y[2] == get_activation(z[1]));
+    int test_pass_fail = TEST_FLOAT_EQ(y[0], 1, EPS_TEST)
+                         && TEST_FLOAT_EQ(y[1], get_activation(z[0]), EPS_TEST)
+                         && TEST_FLOAT_EQ(y[2], get_activation(z[1]), EPS_TEST);
 
     y = get_y(n, 1, samples);
     z = get_z(n, 1, samples);
 
     test_pass_fail = test_pass_fail
-                     && (y[0] == get_activation(z[0]))
-                     && (y[1] == get_activation(z[1]))
-                     && (y[2] == get_activation(z[2]));
+                     && TEST_FLOAT_EQ(y[0], get_activation(z[0]), EPS_TEST)
+                     && TEST_FLOAT_EQ(y[1], get_activation(z[1]), EPS_TEST)
+                     && TEST_FLOAT_EQ(y[2], get_activation(z[2]), EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_get_z.c b/tests/test_get_z.c
index be921c6..4367604 100644
--- a/tests/test_get_z.c
+++ b/tests/test_get_z.c
@@ -4,7 +4,7 @@
 {
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
+    float samples[] = {1, 1, 1, 1};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -23,13 +23,13 @@
     n->l[0].n[2].w[2] = 0.0;
     n->l[0].n[2].w[3] = 0.0;
 
-    double *z = get_z(n, 0, samples);
+    float *z = get_z(n, 0, samples);
 
-    // printf("z[0] == %.2lf\n", z[0]);
-    // printf("z[1] == %.2lf\n", z[1]);
-    // printf("z[2] == %.2lf\n", z[2]);
+    // printf("z[0] == %.2f\n", z[0]);
+    // printf("z[1] == %.2f\n", z[1]);
+    // printf("z[2] == %.2f\n", z[2]);
 
-    int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1);
+    int test_pass_fail = TEST_FLOAT_EQ(z[0], 2, EPS_TEST) && TEST_FLOAT_EQ(z[1], 6, EPS_TEST) && TEST_FLOAT_EQ(z[2], -1, EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_init_layer.c b/tests/test_init_layer.c
index 13b7ae6..761f331 100644
--- a/tests/test_init_layer.c
+++ b/tests/test_init_layer.c
@@ -6,10 +6,10 @@
     LAYER *l = init_layer(3, 4, BATCH_SIZE);
 
     // printf("%d\n", l->num_neurons);
-    // printf("%lf\n", l->n[0].w[0]);
-    // printf("%lf\n", l->n[1].w[0]);
-    // printf("%lf\n", l->n[2].w[0]);
-    // printf("%lf\n", l->n[0].lw[0]);
+    // printf("%f\n", l->n[0].w[0]);
+    // printf("%f\n", l->n[1].w[0]);
+    // printf("%f\n", l->n[2].w[0]);
+    // printf("%f\n", l->n[0].lw[0]);
     // printf("%d\n", l->n[0].num_weights);
     // printf("%d\n", l->n[1].num_weights);
     // printf("%d\n", l->n[2].num_weights);
diff --git a/tests/test_init_network.c b/tests/test_init_network.c
index 375565f..2e5603b 100644
--- a/tests/test_init_network.c
+++ b/tests/test_init_network.c
@@ -12,7 +12,7 @@
     // printf("%d\n", n->l[1].num_neurons);
     // printf("%d\n", n->l[2].num_neurons);
     // printf("%d\n", n->l[0].n[0].num_weights);
-    // printf("%lf\n", n->l[0].n[0].lw[0]);
+    // printf("%f\n", n->l[0].n[0].lw[0]);
     // printf("%d\n", n->l[1].n[0].num_weights);
     // printf("%d\n", n->l[2].n[0].num_weights);
 
diff --git a/tests/test_init_neuron.c b/tests/test_init_neuron.c
index 61d0232..486548c 100644
--- a/tests/test_init_neuron.c
+++ b/tests/test_init_neuron.c
@@ -6,8 +6,8 @@
     NEURON *n = init_neuron(2);
 
     // printf("%d\n", n->num_weights);
-    // printf("%lf\n", n->w[0]);
-    // printf("%lf\n", n->lw[0]);
+    // printf("%f\n", n->w[0]);
+    // printf("%f\n", n->lw[0]);
 
     return (n->num_weights == 2) && (n->w[0] <= 1) && (n->w[0] >= -1) && (n->lw[0] == n->w[0]);
 }
diff --git a/tests/test_matrix.c b/tests/test_matrix.c
index 25323e9..37e9308 100644
--- a/tests/test_matrix.c
+++ b/tests/test_matrix.c
@@ -1,31 +1,43 @@
 #include "mlp.h"
 #include "test.h"
+#include "upmem.h"
 
 int test_multiply_matrix()
 {
     int test_result_pass_fail = 1;
 
-    double matrixA[2*3] = {1.0, 2.0, 3.0,
-                           0.0, 5.0, 6.0};
+    float matrixA[2*3] = {1.0, 2.0, 3.0,
+                          0.0, 5.0, 6.0};
 
-    double matrixB[3*2] = {2.0, 6.0,
-                           3.0, 3.0,
-                           4.0, 0.0};
+    float matrixB[3*2] = {2.0, 6.0,
+                          3.0, 3.0,
+                          4.0, 0.0};
 
-    // result matrix (initialized with random double values [0.0, 20.0])
-    double matrixC[2*2];
+    // result matrices (initialized with random float values [0.0, 20.0])
+    float matrixC[2*2];
+    float matrixD[2*2];
     for(int i=0; i<2*2; i++) {
-        matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20;
+        matrixC[i] = ((float)rand() / (float)RAND_MAX) * 20;
+        matrixD[i] = ((float)rand() / (float)RAND_MAX) * 20;
     }
 
     // ideal result
-    double matrixR[2*2] = {20.0, 12.0,
-                           39.0, 15.0};
+    float matrixR[2*2] = {20.0, 12.0,
+                          39.0, 15.0};
 
-    multiply_matrix(matrixA, matrixB, matrixC, 2, 3, 2);
+    multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2);
+
+    init_dpus();
+    multiply_matrix_upmem(matrixA, matrixB, matrixD, 2, 3, 2);
+    free_dpus();
 
     for(int i=0; i<2*2; i++) {
-        test_result_pass_fail |= matrixC[i] == matrixR[i];
+        test_result_pass_fail &= TEST_FLOAT_EQ(matrixC[i], matrixR[i], EPS_TEST);
+        test_result_pass_fail &= TEST_FLOAT_EQ(matrixC[i], matrixD[i], EPS_TEST);
     }
 
     return test_result_pass_fail;
@@ -35,17 +47,17 @@
 {
     int test_result_pass_fail = 1;
 
-    double matrixA[2*3] = {1.0, 2.0, 3.0,
-                           0.0, 5.0, 6.0};
+    float matrixA[2*3] = {1.0, 2.0, 3.0,
+                          0.0, 5.0, 6.0};
 
-    // result matrix (initialized with random double values [0.0, 20.0])
-    double matrixT[3*2];
+    // result matrix (initialized with random float values [0.0, 20.0])
+    float matrixT[3*2];
     for(int i=0; i<3*2; i++) {
-        matrixT[i] = ((double)rand() / (double)RAND_MAX) * 20;
+        matrixT[i] = ((float)rand() / (float)RAND_MAX) * 20;
     }
 
     // ideal result
-    double matrixR[3*2] = {1.0, 0.0,
-                           2.0, 5.0,
-                           3.0, 6.0};
+    float matrixR[3*2] = {1.0, 0.0,
+                          2.0, 5.0,
+                          3.0, 6.0};
 
diff --git a/tests/test_sse.c b/tests/test_sse.c
index 8f660c0..732258a 100644
--- a/tests/test_sse.c
+++ b/tests/test_sse.c
@@ -3,26 +3,26 @@
 
 int test_sse()
 {
-    double real[] = {3, 4, 4, 4};
-    double ideal[] = {4, 4, 4, 4};
+    float real[] = {3, 4, 4, 4};
+    float ideal[] = {4, 4, 4, 4};
 
     int test_pass_fail = 1;
 
-    double sse_result = sse(real, ideal, 4);
+    float sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 1);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 1, EPS_TEST);
 
     real[0] = 4;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 0);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 0, EPS_TEST);
 
     real[0] = 6;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 4);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 4, EPS_TEST);
 
     real[0] = 6;
     real[1] = 2;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 8);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 8, EPS_TEST);
 
     return test_pass_fail;
 }