diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh
new file mode 100644
index 0000000..cd157ec
--- /dev/null
+++ b/.github/build_upmem_toolchain.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
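+# Fetch the UPMEM SDK tarball from the kagandikmen/upmem-sdk repository, unpack it, and install the toolchain under /usr/local/bin/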
+cd /opt/
+git clone https://github.com/kagandikmen/upmem-sdk.git
+tar -xvf upmem-sdk/2024.2.0/upmem-2024.2.0-Linux-x86_64.tar.gz
+mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/
+rm -rf upmem-sdk/
\ No newline at end of file
diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/memory_leak_tests.yaml
similarity index 50%
rename from .github/workflows/valgrind.yaml
rename to .github/workflows/memory_leak_tests.yaml
index 69cfa97..e6a8ba3 100644
--- a/.github/workflows/valgrind.yaml
+++ b/.github/workflows/memory_leak_tests.yaml
@@ -1,4 +1,4 @@
-name: Valgrind
+name: Memory Leak Tests
on:
push:
@@ -6,7 +6,7 @@ on:
jobs:
memcheck:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-22.04
steps:
- name: Checkout repository
@@ -19,25 +19,40 @@ jobs:
sudo apt update
sudo apt install -y build-essential valgrind
pip3 install numpy
+ sudo bash .github/build_upmem_toolchain.sh
- name: Extract training samples & labels
run: python3 read_dataset.py
- - name: Compile MLP
- run: gcc -g -DEPSILON=0.5 -DNUM_TRAIN_SAMPLES=2 -Iinclude src/*.c -o mlp -lm
+ - name: Compile MLP without sanitizer or UPMEM
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+ make SAN=0 UPMEM=0
- name: Run Valgrind
run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
valgrind --leak-check=full \
--show-leak-kinds=all \
--track-origins=yes \
--error-exitcode=1 \
--log-file=valgrind.txt \
- ./mlp > /dev/null
+ ./build/mlp > /dev/null
- name: Save Valgrind log
if: always()
uses: actions/upload-artifact@v4
with:
name: valgrind_log
- path: valgrind.txt
\ No newline at end of file
+ path: valgrind.txt
+
+ - name: Compile MLP with sanitizer and UPMEM
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+ make clean
+ make SAN=1 UPMEM=1
+
+ - name: Run with sanitizer
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+ ./build/mlp > /dev/null
\ No newline at end of file
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index cba3dcc..897d053 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -6,7 +6,7 @@ on:
jobs:
build-and-test:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-22.04
steps:
- name: Checkout repository
@@ -15,19 +15,27 @@ jobs:
submodules: 'recursive'
- name: Install dependencies
- run: sudo apt update && sudo apt install -y build-essential
+ run: |
+ sudo apt update && sudo apt install -y build-essential python3.10 python3.10-dev
+ sudo bash .github/build_upmem_toolchain.sh
- name: Create build directory
run: mkdir build
- name: Run CMake
working-directory: build
- run: cmake ..
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+ cmake ..
- name: Build
working-directory: build
- run: make
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+ make
- name: Run the tests
working-directory: build
- run: make test
\ No newline at end of file
+ run: |
+ source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+ make test
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 24a870c..274d4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
matmul.c
matrices.h
-dpu/
*.o
*.out
training_images.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d408d10..7f33b08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,18 +6,44 @@ set(CMAKE_C_STANDARD_REQUIRED ON)
include_directories(include)
-file(GLOB SRC_FILES src/*.c)
-list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/mlp.c")
+file(GLOB SRC_FILES src/host/*.c)
+list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/host/mlp.c")
file(GLOB TEST_FILES tests/*.c)
+execute_process(
+ COMMAND dpu-pkg-config --cflags dpu
+ OUTPUT_VARIABLE DPU_C_FLAGS
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+ COMMAND dpu-pkg-config --libs dpu
+ OUTPUT_VARIABLE DPU_LIBS
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
enable_testing()
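+# Cross-compile the DPU kernel with the UPMEM toolchain so the test binaries can load it from the build directory at runtime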
+add_custom_target(build_dpu_program ALL
+ COMMAND dpu-upmem-dpurte-clang
+ -I${CMAKE_SOURCE_DIR}/include
+ -o ${CMAKE_BINARY_DIR}/dpu_program
+ ${CMAKE_SOURCE_DIR}/src/dpu/dpu_program.c
+)
+
+add_compile_definitions(
+    # NUM_DPU=1  -- intentionally left commented out: overriding it here would not apply to the
+    # dpu-upmem-dpurte-clang invocation above and would therefore create a mismatch between
+    # dpu_program.c and the host-side code, so avoid overriding dimensions that are set through macros in the headers.
+ DPU_BINARY_PATH=\"./dpu_program\"
+)
+
foreach(TEST_SRC ${TEST_FILES})
get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
add_executable(${TEST_NAME} ${TEST_SRC} ${SRC_FILES})
target_include_directories(${TEST_NAME} PRIVATE include)
- target_link_libraries(${TEST_NAME} m)
+ target_compile_options(${TEST_NAME} PRIVATE ${DPU_C_FLAGS})
+ target_link_libraries(${TEST_NAME} PRIVATE m ${DPU_LIBS})
add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
endforeach()
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 9c5cb69..e2f8548 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,34 @@
-CLANG = dpu-upmem-dpurte-clang
-SOURCE = matmul
-CFLAGS += -O0 -DNR_TASKLETS=6
-FILESTODELETE = matmul.c dpu/
-
-all:
- python3 generate.py && \
- for test in $$(seq 0 15); do \
- $(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
- done
- gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang
+DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16
-clean:
- rm -rf *.o ${FILESTODELETE}
+BATCH_SIZE ?= 20
+MAX_EPOCH ?= 10
+NUM_TRAIN_SAMPLES ?= 200
+
+CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG
+CFLAGS += -DBATCH_SIZE=$(BATCH_SIZE) -DMAX_EPOCH=$(MAX_EPOCH) -DNUM_TRAIN_SAMPLES=$(NUM_TRAIN_SAMPLES)
+
+BUILD_DIR = build/
+
+UPMEM ?= 1
+ifeq ($(UPMEM), 1)
+ CFLAGS += -DUPMEM
+endif
-clean_all:
- rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
+SAN ?= 0
+ifeq ($(SAN), 1)
+ CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g
+endif
+
+EVAL ?= 0
+ifeq ($(EVAL), 1)
+ CFLAGS += -DEVAL
+endif
+
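+# build the DPU kernel (build/dpu_program) and the host binary (build/mlp)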
+all: clean
+ mkdir $(BUILD_DIR); \
+ $(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \
+ gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+ rm -rf $(BUILD_DIR)
diff --git a/README.md b/README.md
index a4acb69..ae97435 100644
--- a/README.md
+++ b/README.md
@@ -1,99 +1,108 @@
# UPMEM-MLP
-UPMEM-MLP is an attempt at implementing a multilayer perceptron application in pure C and accelerating this application on the UPMEM platform.
+UPMEM-MLP implements a multilayer perceptron training application in C and accelerates it on the UPMEM platform.
-[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Valgrind](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml)
+[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Memory Leak Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml)
-## Requirements
+## Prerequisites
-- GCC or Clang
- CMake 3.10 or higher
+- GCC
+- Python
- UPMEM SDK
-### Installing UPMEM SDK
+
+### Installing UPMEM SDK
-To set up the UPMEM SDK on your system:
+1. Download the UPMEM SDK tarball for your system from [this link](https://github.com/kagandikmen/upmem-sdk)
-1. Download UPMEM SDK tarball for your system from [this link](https://sdk.upmem.com/)
+> **NOTICE:** The UPMEM SDK is no longer available for download from UPMEM's official SDK [Downloads](https://sdk.upmem.com) page.
2. Extract its content and (preferably) move it to a better place like `/usr/local/bin/`
-3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc` as in:
+3. Source the shell script `upmem_env.sh`, which sets the necessary environment variables, from your `.bashrc`:
```bash
-source /usr/local/bin/upmem-sdk/upmem_env.sh > /dev/null
+source /usr/local/bin/upmem-sdk/upmem_env.sh simulator > /dev/null
```
4. Restart your shell session for the changes to become effective
-5. Test your setup using:
+5. Test your setup:
```bash
which dpu-lldb
```
+---
+
-which should, if correctly installed, return the path to the LLDB Debugger binary of UPMEM SDK
+## Getting Started
-## Running the Unit Tests
-
-To run the CMake test flow:
+1. Clone this repository and navigate inside it:
```bash
-mkdir build
-cd build
-cmake ..
-make
-make test
+git clone https://github.com/OpenHardware-Initiative/UPMEM-MLP.git
+cd UPMEM-MLP
```
-## Compiling the Multilayer Perceptron Natively
-
-To natively run the C multilayer perceptron on your system:
-
-1. Create a Python virtual environment (optional, but recommended) and install requirements:
+2. **(Optional, but recommended)** Create a Python virtual environment:
```bash
python3 -m venv venv
source venv/bin/activate
+```
+
+3. Install Python requirements:
+
+```bash
pip install -r requirements.txt
```
-2. Extract training samples & labels:
+4. Extract training samples & labels:
```bash
python3 read_dataset.py
```
-3. Compile the application:
+5. Compile the MLP:
```bash
-gcc -Iinclude src/*.c -o mlp -lm
+make
+```
+
+6. Run the MLP:
+
+```bash
+./build/mlp
```
With this command, you can use:
-- `-DVERBOSE` for the verbose mode, which prints loss deltas for all epochs
-- `-DDEBUG` for the debug mode, which prints a couple samples & labels at the beginning and all weights at the end
-- `-DBATCH_SIZE=...` to configure the batch size used during training
-- `-DMAX_EPOCH=...` to configure the maximum number of epochs the training can run for
-- `-DEPSILON=...` to configure epsilon from the command line
-- `-DLEARNING_RATE=...` to configure learning rate from the command line
-- `-DDECAY_RATE=...` to configure the decay rate of the learning rate
-- `-DMOMENTUM=...` to configure momentum from the command line
-- `-DNUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with
-- `-DTRAINING_SAMPLES_FILE=...` to configure the path to the text file samples should be sourced from
-- `-DTRAINING_LABELS_FILE=...` to configure the path to the text file labels should be sourced from
+- `BATCH_SIZE=...` to set the batch size used during training (default: 20)
+- `MAX_EPOCH=...` to set the maximum number of epochs the training can run for (default: 10)
+- `NUM_TRAIN_SAMPLES=...` to set how many samples the model is trained with (default: 200)
+- `UPMEM=0` to turn off matrix multiplication on UPMEM
+- `SAN=1` to build the MLP with the GCC sanitizers enabled
+- `EVAL=1` to run the MLP in evaluation mode, which adds the number of cycles spent in training to the printout
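+
+For example, a sanitizer-instrumented build that trains on 100 samples without offloading to UPMEM (one possible combination of the variables above) can be produced with:
+
+```bash
+make SAN=1 UPMEM=0 NUM_TRAIN_SAMPLES=100
+./build/mlp
+```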
-## Status
+## Running the Unit Tests
+
+UPMEM-MLP comes with unit tests, which can be found in `tests/`. Run these unit tests using:
-UPMEM-MLP is a work in progress as of 2025-11-14.
+```bash
+mkdir build
+cd build
+cmake ..
+make
+make test
+```
-### To-Do
+## Status
-- [ ] Adapt `multiply_matrix` for in-memory matrix multiplication on UPMEM
+UPMEM-MLP is feature-complete and actively maintained as of 2025-11-23.
## License
UPMEM-MLP is licensed under the Apache License v2.0. See [LICENSE](LICENSE) for more details.
----
\ No newline at end of file
+---
diff --git a/benchmarks.md b/benchmarks.md
new file mode 100644
index 0000000..ce5ca11
--- /dev/null
+++ b/benchmarks.md
@@ -0,0 +1,10 @@
+# Benchmark Results
+
+## NN Layout: NUM_FEATURES -> 4096 -> 4096 -> 2048 -> NUM_LABELS
+
+| BATCH_SIZE | NUM_TRAIN_SAMPLES | MAX_EPOCH | Cycles (Intel 64 Host) | Cycles (Intel 64 Host + UPMEM) |
+|------------|-------------------|-----------|------------------------|--------------------------------|
+| 1200 | 3600 | 1 | 13.05T | 12.73T |
+| 3600 | 10800 | 1 | 42.38T | 39.49T |
+
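+To reproduce the configuration of the first row, set the layer sizes in `src/host/mlp.c` to the layout above and build in evaluation mode, e.g.:
+
+```bash
+make EVAL=1 BATCH_SIZE=1200 NUM_TRAIN_SAMPLES=3600 MAX_EPOCH=1
+./build/mlp
+```
+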
+---
diff --git a/include/mlp.h b/include/mlp.h
index b2a6616..081cea9 100644
--- a/include/mlp.h
+++ b/include/mlp.h
@@ -35,13 +35,13 @@ extern unsigned int rseed;
typedef struct {
int num_weights;
- double *w, *lw;
- double *batch_dw;
+ float *w, *lw;
+ float *batch_dw;
} NEURON;
typedef struct {
int num_neurons;
- double *inputs, *deltas;
+ float *inputs, *deltas;
NEURON *n;
} LAYER;
@@ -50,22 +50,23 @@ typedef struct {
LAYER *l;
} NETWORK;
-void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate);
+void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate);
void apply_gradients(NETWORK *n, int batch_size);
-double drand();
-double get_activation(double x);
-double get_activation_derivative(double x);
-double *get_delta(NETWORK *n, double *samples, double *ideal, int layer_index);
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples);
-double *get_y(NETWORK *n, int layer_index, double *sample);
-double *get_z(NETWORK *n, int layer_index, double *sample);
+float drand();
+float get_activation(float x);
+float get_activation_derivative(float x);
+float *get_delta(NETWORK *n, float *samples, float *ideal, int layer_index);
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples);
+float *get_y(NETWORK *n, int layer_index, float *sample);
+float *get_z(NETWORK *n, int layer_index, float *sample);
LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size);
NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size);
NEURON *init_neuron(int num_weights);
-void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols);
-double sse(double *real, double *ideal, int length);
-void transpose_matrix(const double *A, double *C, int rows, int cols);
+float sse(float *real, float *ideal, int length);
+void transpose_matrix(const float *A, float *C, int rows, int cols);
//
// utility functions
@@ -75,10 +76,10 @@ void free_layer(LAYER *l);
void free_network(NETWORK *n);
void free_neuron(NEURON *n);
-void free_double_matrix(double **addr, int nrows);
+void free_float_matrix(float **addr, int nrows);
void free_uint8_matrix(uint8_t **addr, int nrows);
-void print_double_matrix(double **addr, int nrows, int ncols);
-void print_double_vector(double *addr, int nrows);
+void print_float_matrix(float **addr, int nrows, int ncols);
+void print_float_vector(float *addr, int nrows);
#endif
diff --git a/include/test.h b/include/test.h
index 5cf9797..880f862 100644
--- a/include/test.h
+++ b/include/test.h
@@ -11,4 +11,8 @@ if(test_result == 0) \
printf("PASS\n"); \
return 0; \
+#define TEST_FLOAT_EQ(v1, v2, eps) (fabsf((v1) - (v2)) < (eps))
+
+#define EPS_TEST 1e-5
+
#endif
\ No newline at end of file
diff --git a/include/upmem.h b/include/upmem.h
new file mode 100644
index 0000000..78bf601
--- /dev/null
+++ b/include/upmem.h
@@ -0,0 +1,33 @@
+#ifndef UPMEM_H
+#define UPMEM_H
+
+#include <stdint.h>
+
+#ifndef DPU_BINARY_PATH
+#define DPU_BINARY_PATH "build/dpu_program"
+#endif
+
+#ifndef NUM_DPU
+#define NUM_DPU 32
+#endif
+
+#ifndef TILE_SIZE
+#define TILE_SIZE 512
+#endif
+
+#define EVAL_DPU_CC 458000000
+
+typedef struct {
+ uint32_t rows_a;
+ uint32_t cols_a;
+ uint32_t cols_b;
+} dpu_args_t;
+
+extern int upmem_initialized;
+
+void free_dpus();
+void init_dpus();
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+
+#endif
diff --git a/legacy/Makefile b/legacy/Makefile
new file mode 100644
index 0000000..9c5cb69
--- /dev/null
+++ b/legacy/Makefile
@@ -0,0 +1,17 @@
+CLANG = dpu-upmem-dpurte-clang
+SOURCE = matmul
+CFLAGS += -O0 -DNR_TASKLETS=6
+FILESTODELETE = matmul.c dpu/
+
+all:
+ python3 generate.py && \
+ for test in $$(seq 0 15); do \
+ $(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
+ done
+ gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+ rm -rf *.o ${FILESTODELETE}
+
+clean_all:
+ rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
diff --git a/generate.py b/legacy/generate.py
similarity index 100%
rename from generate.py
rename to legacy/generate.py
diff --git a/host.c b/legacy/host.c
similarity index 100%
rename from host.c
rename to legacy/host.c
diff --git a/matmul.template b/legacy/matmul.template
similarity index 100%
rename from matmul.template
rename to legacy/matmul.template
diff --git a/matrices.template b/legacy/matrices.template
similarity index 100%
rename from matrices.template
rename to legacy/matrices.template
diff --git a/src/activation.c b/src/activation.c
deleted file mode 100644
index eeaaee7..0000000
--- a/src/activation.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "mlp.h"
-
-double get_activation(double x)
-{
- return tanh(x);
-}
-
-double get_activation_derivative(double x)
-{
- return 1.0 / pow(cosh(x), 2);
-}
\ No newline at end of file
diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c
new file mode 100644
index 0000000..cea4413
--- /dev/null
+++ b/src/dpu/dpu_program.c
@@ -0,0 +1,46 @@
+#include <stdint.h>
+#include <defs.h>
+#include <mram.h>
+#include <perfcounter.h>
+#include "upmem.h"
+
+__mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE];
+__mram_noinit float B_whole[TILE_SIZE * TILE_SIZE];
+__mram_noinit float C_chunk[TILE_SIZE * TILE_SIZE];
+
+__host dpu_args_t DPU_INPUT_ARGS;
+
+int main()
+{
+ perfcounter_config(COUNT_CYCLES, false);
+
+ dpu_args_t dpu_input_args = DPU_INPUT_ARGS;
+ uint32_t rows_a = dpu_input_args.rows_a;
+ uint32_t cols_a = dpu_input_args.cols_a;
+ uint32_t cols_b = dpu_input_args.cols_b;
+
+ if(!rows_a)
+ return 0;
+
+ perfcounter_t cc_start = perfcounter_get();
+
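+    // split the rows of A into NR_TASKLETS contiguous chunks; this tasklet handles rows [row_start, row_start + chunk)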
+ int chunk = rows_a / NR_TASKLETS;
+ int row_start = chunk * me();
+
+ for(int i=row_start; i<(row_start+chunk); ++i) {
+        for(int j=0; j<cols_b; ++j) {
+            float sum = 0.0f;
+            for(int k=0; k<cols_a; ++k)
+                sum += A_chunk[i*cols_a + k] * B_whole[k*cols_b + j];
+            C_chunk[i*cols_b + j] = sum;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/accumulate_layer_gradients.c b/src/host/accumulate_layer_gradients.c
rename from src/accumulate_layer_gradients.c
rename to src/host/accumulate_layer_gradients.c
--- a/src/accumulate_layer_gradients.c
+++ b/src/host/accumulate_layer_gradients.c
     int num_neurons = l->num_neurons;
int num_weights = l->n->num_weights;
- double *gradient = (double *) malloc (num_neurons * num_weights * sizeof(double));
+ float *gradient = (float *) malloc (num_neurons * num_weights * sizeof(float));
if(!gradient) {
return;
}
- double *deltas_T = (double*) malloc (num_neurons * batch_size * sizeof(double));
+ float *deltas_T = (float*) malloc (num_neurons * batch_size * sizeof(float));
if(!deltas_T) {
free(gradient);
return;
diff --git a/src/host/activation.c b/src/host/activation.c
new file mode 100644
index 0000000..5345ec6
--- /dev/null
+++ b/src/host/activation.c
@@ -0,0 +1,11 @@
+#include "mlp.h"
+
+float get_activation(float x)
+{
+ return tanhf(x);
+}
+
+float get_activation_derivative(float x)
+{
+ return 1.0 / powf(coshf(x), 2);
+}
\ No newline at end of file
diff --git a/src/apply_gradients.c b/src/host/apply_gradients.c
similarity index 71%
rename from src/apply_gradients.c
rename to src/host/apply_gradients.c
index 4bc143b..ede95e7 100644
--- a/src/apply_gradients.c
+++ b/src/host/apply_gradients.c
@@ -15,11 +15,11 @@ void apply_gradients(NETWORK *n, int batch_size)
         for(int k=0; k<np->num_weights; k++) // do the following for all weights "k" of said neuron:
{
- double previous_weight_update = np->w[k] - np->lw[k];
- double momentum_term = MOMENTUM * previous_weight_update;
- double gradient_term = np->batch_dw[k] / (double) batch_size;
+ float previous_weight_update = np->w[k] - np->lw[k];
+ float momentum_term = MOMENTUM * previous_weight_update;
+ float gradient_term = np->batch_dw[k] / (float) batch_size;
- double old_weight = np->w[k];
+ float old_weight = np->w[k];
np->lw[k] = old_weight;
np->w[k] = old_weight + gradient_term + momentum_term;
diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c
new file mode 100644
index 0000000..17bc719
--- /dev/null
+++ b/src/host/dpu_host.c
@@ -0,0 +1,149 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dpu.h>
+#include "upmem.h"
+
+struct dpu_set_t dpus, dpu;
+int upmem_initialized = 0;
+
+void free_dpus()
+{
+ DPU_ASSERT(dpu_free(dpus));
+}
+
+void init_dpus()
+{
+ if(!upmem_initialized) {
+ DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus));
+ DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL));
+
+ upmem_initialized = 1;
+ }
+}
+
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+ float tileA[TILE_SIZE][TILE_SIZE];
+ float tileB[TILE_SIZE][TILE_SIZE];
+ float tileC[TILE_SIZE][TILE_SIZE];
+
+ for(int i=0; i= rows_a) ? 0
+ : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+ : dpu_rows_a_max;
+
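+        // tell this DPU how many rows of A it owns, along with the shared matrix dimensions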
+ dpu_args_t args = {
+ .rows_a = dpu_rows_a_actual,
+ .cols_a = cols_a,
+ .cols_b = cols_b
+ };
+
+ DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args)));
+
+ if(dpu_rows_a_actual) {
+ uint32_t elems_a = dpu_rows_a_actual * cols_a;
+ uint32_t bytes_a = elems_a * sizeof(float);
+
+ float *A_chunk = (float*)malloc(bytes_a);
+
+ for(int r=0; r= rows_a) ? 0
+ : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+ : dpu_rows_a_max;
+
+ if(dpu_rows_a_actual) {
+ uint32_t elems_c = dpu_rows_a_actual * cols_b;
+ uint32_t bytes_c = elems_c * sizeof(float);
+
+ float *C_chunk = (float*)malloc(bytes_c);
+
+ DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c));
+
+            for(int r=0; r<dpu_rows_a_actual; ++r)
+                memcpy(&C[(row_start + r) * cols_b], &C_chunk[r * cols_b], cols_b * sizeof(float));
+
+            free(C_chunk);
+        }
+    }
+}
diff --git a/src/get_delta.c b/src/host/get_delta.c
rename from src/get_delta.c
rename to src/host/get_delta.c
--- a/src/get_delta.c
+++ b/src/host/get_delta.c
     int layer_size = (n->l+layer_index)->num_neurons;
- double *d = (double*) malloc (sizeof(double) * layer_size);
+ float *d = (float*) malloc (sizeof(float) * layer_size);
if(!d) {
fprintf(stderr, "Error 10010\n");
return NULL;
}
- double *z = get_z(n, layer_index, sample);
+ float *z = get_z(n, layer_index, sample);
if(!z) {
fprintf(stderr, "Error 10011\n");
free(d);
@@ -21,7 +21,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index)
if(is_current_layer_last_layer)
{
- double *y = get_y(n, layer_index, sample);
+ float *y = get_y(n, layer_index, sample);
if(!y) {
fprintf(stderr, "Error 10012\n");
free(d);
@@ -36,7 +36,7 @@ double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index)
}
else
{
- double *next_d = get_delta(n, sample, ideal, layer_index+1);
+ float *next_d = get_delta(n, sample, ideal, layer_index+1);
if(!next_d) {
fprintf(stderr, "Error 10013\n");
free(d);
diff --git a/src/get_total_loss.c b/src/host/get_total_loss.c
similarity index 67%
rename from src/get_total_loss.c
rename to src/host/get_total_loss.c
index 8bf7f2c..c386536 100644
--- a/src/get_total_loss.c
+++ b/src/host/get_total_loss.c
@@ -1,8 +1,8 @@
#include "mlp.h"
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples)
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples)
{
- double *total_loss = (double*) malloc (sizeof(double));
+ float *total_loss = (float*) malloc (sizeof(float));
if(!total_loss) {
fprintf(stderr, "Error 10007\n");
return NULL;
@@ -13,13 +13,13 @@ double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsample
LAYER *last_layer = n->l+(n->num_layers-1);
     for(int i=0; i<nsamples; i++)
     {
-        double *y = get_y(n, n->num_layers-1, samples[i]);
+ float *y = get_y(n, n->num_layers-1, samples[i]);
if(!y) {
fprintf(stderr, "Error 10008\n");
free(total_loss);
return NULL;
}
- *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (double)nsamples;
+ *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (float)nsamples;
free(y);
}
diff --git a/src/get_y.c b/src/host/get_y.c
similarity index 78%
rename from src/get_y.c
rename to src/host/get_y.c
index 3e5b70e..5931c62 100644
--- a/src/get_y.c
+++ b/src/host/get_y.c
@@ -2,7 +2,7 @@
// preactivation -> get_y -> activation
-double *get_y(NETWORK *n, int layer_index, double *sample)
+float *get_y(NETWORK *n, int layer_index, float *sample)
{
LAYER *current_layer = n->l+layer_index;
int is_current_layer_last_layer = (n->num_layers == layer_index + 1);
@@ -11,9 +11,9 @@ double *get_y(NETWORK *n, int layer_index, double *sample)
if(!is_current_layer_last_layer) // add bias node
y_size++;
- double *z = get_z(n, layer_index, sample);
+ float *z = get_z(n, layer_index, sample);
- double *y = (double *) malloc (sizeof(double)*y_size);
+ float *y = (float *) malloc (sizeof(float)*y_size);
if(!y) {
fprintf(stderr, "Error 10006\n");
return NULL;
diff --git a/src/get_z.c b/src/host/get_z.c
similarity index 75%
rename from src/get_z.c
rename to src/host/get_z.c
index ad7a08d..466ee1a 100644
--- a/src/get_z.c
+++ b/src/host/get_z.c
@@ -2,20 +2,20 @@
// samples -> get_z -> preactivation
-double *get_z(NETWORK *n, int layer_index, double *sample)
+float *get_z(NETWORK *n, int layer_index, float *sample)
{
LAYER *current_layer = n->l+layer_index;
int z_neuroncount = current_layer->num_neurons;
int z_weightcount = current_layer->n->num_weights;
int is_first_layer = layer_index == 0;
- double *z = (double *) malloc (sizeof(double)* z_neuroncount);
+ float *z = (float *) malloc (sizeof(float)* z_neuroncount);
if(!z) {
fprintf(stderr, "Error 10005\n");
return NULL;
}
- double *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
+ float *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
     for(size_t i=0; i<z_neuroncount; ++i)
diff --git a/src/init_layer.c b/src/host/init_layer.c
rename from src/init_layer.c
rename to src/host/init_layer.c
--- a/src/init_layer.c
+++ b/src/host/init_layer.c
     l->num_neurons = num_neurons;
- l->inputs = (double*) malloc (batch_size * num_weights_per_neuron * sizeof(double));
+ l->inputs = (float*) malloc (batch_size * num_weights_per_neuron * sizeof(float));
if(!l->inputs) {
free(l);
return NULL;
}
- l->deltas = (double*) malloc (batch_size * num_neurons * sizeof(double));
+ l->deltas = (float*) malloc (batch_size * num_neurons * sizeof(float));
if(!l->deltas) {
free(l->inputs);
free(l);
diff --git a/src/init_network.c b/src/host/init_network.c
similarity index 100%
rename from src/init_network.c
rename to src/host/init_network.c
diff --git a/src/init_neuron.c b/src/host/init_neuron.c
similarity index 65%
rename from src/init_neuron.c
rename to src/host/init_neuron.c
index b5506f3..450677e 100644
--- a/src/init_neuron.c
+++ b/src/host/init_neuron.c
@@ -9,20 +9,20 @@ NEURON *init_neuron(int num_weights)
n->num_weights = num_weights;
- n->w = (double *) malloc (sizeof(double) * n->num_weights);
+ n->w = (float *) malloc (sizeof(float) * n->num_weights);
if(!n->w) {
free(n);
return NULL;
}
- n->lw = (double *) malloc (sizeof(double) * n->num_weights);
+ n->lw = (float *) malloc (sizeof(float) * n->num_weights);
if(!n->lw) {
free(n->w);
free(n);
return NULL;
}
- n->batch_dw = (double *) malloc (sizeof(double) * n->num_weights);
+ n->batch_dw = (float *) malloc (sizeof(float) * n->num_weights);
if(!n->batch_dw) {
free(n->lw);
free(n->w);
@@ -30,11 +30,11 @@ NEURON *init_neuron(int num_weights)
return NULL;
}
- double limit = 1.0/sqrt((double) num_weights);
+ float limit = 1.0/sqrt((float) num_weights);
     for(int i=0; i<num_weights; i++)
     {
-        double rand_unit = drand();
+        float rand_unit = drand();
         n->w[i] = (rand_unit * 2.0 - 1.0) * limit;
n->lw[i] = n->w[i];
n->batch_dw[i] = 0;
diff --git a/src/host/matrix.c b/src/host/matrix.c
new file mode 100644
index 0000000..967ba10
--- /dev/null
+++ b/src/host/matrix.c
@@ -0,0 +1,34 @@
+#include "mlp.h"
+#include "upmem.h"
+
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+#ifdef UPMEM
+ init_dpus();
+ multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b);
+#else
+ multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b);
+#endif
+}
+
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    for(int i=0; i<rows_a; ++i) {
+        for(int j=0; j<cols_b; ++j) {
+            float sum = 0.0f;
+            for(int k=0; k<cols_a; ++k)
+                sum += A[i*cols_a + k] * B[k*cols_b + j];
+            C[i*cols_b + j] = sum;
+        }
+    }
+}
diff --git a/src/mlp.c b/src/host/mlp.c
rename from src/mlp.c
rename to src/host/mlp.c
--- a/src/mlp.c
+++ b/src/host/mlp.c
 #include "mlp.h"
+#include "upmem.h"
+#include <x86intrin.h>
 
 unsigned int rseed = 42;
@@ -11,8 +13,8 @@ int main()
int epoch = 0;
int num_inputs = NUM_FEATURES;
- int num_layers = 5;
- int num_neurons_per_layer[] = {NUM_FEATURES, 1000, 1000, 100, NUM_LABELS};
+ int num_layers = 3;
+ int num_neurons_per_layer[] = {NUM_FEATURES, 10, NUM_LABELS};
NETWORK *n = init_network(num_inputs, num_layers, num_neurons_per_layer, BATCH_SIZE);
if(!n) {
@@ -20,16 +22,16 @@ int main()
return 1;
}
- double **samples = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
- double **labels = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
+ float **samples = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
+ float **labels = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
uint8_t **sample_data = read_image_data(TRAINING_SAMPLES_FILE, &sample_rows, NUM_FEATURES);
uint8_t **label_data = read_image_data(TRAINING_LABELS_FILE, &label_rows, 1);
// save data into `samples` and `labels`
for(size_t i=0; inum_layers-1; j>=0; --j) {
LAYER *lp = n->l+j; // ptr to layer j of network n
- double *d = get_delta(n, samples[i], labels[i], j);
+ float *d = get_delta(n, samples[i], labels[i], j);
- memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(double));
+ memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(float));
- double *py = j ? get_y(n, j-1, samples[i]) : NULL;
+ float *py = j ? get_y(n, j-1, samples[i]) : NULL;
if(j && !py) {
fprintf(stderr, "Error 10009\n");
return 1;
}
- memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(double));
+ memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(float));
free(d);
if(j) free(py);
@@ -105,18 +117,18 @@ int main()
apply_gradients(n, actual_batch_size);
}
- double *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
+ float *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
if(!loss_new) {
fprintf(stderr, "Error 10015\n");
return 1;
}
- double loss_delta = fabs(*loss_new - *loss_prev);
+ float loss_delta = fabs(*loss_new - *loss_prev);
epoch++;
#ifdef VERBOSE
- printf("Epoch %-3d --- Lost Delta = %.9lf --- Final Loss = %.6lf\n", epoch, loss_delta, *loss_new);
+ printf("Epoch %-3d --- Lost Delta = %.9f --- Final Loss = %.6f\n", epoch, loss_delta, *loss_new);
#endif
free(loss_prev);
@@ -126,7 +138,10 @@ int main()
break;
}
- printf("Training complete in %d epochs\n", epoch);
+#ifdef EVAL
+ unsigned long long cc_end = __rdtsc();
+ printf("Training complete | %lld cycles | %d epochs\n", cc_end-cc_start, epoch);
+#endif
#ifdef DEBUG
printf("\n===== Weights =====\n\n");
@@ -134,16 +149,21 @@ int main()
LAYER *lp = n->l+i; // ptr to i-th layer of the network n
         for(int j=0; j<lp->num_neurons; j++) {
NEURON *np = lp->n+j; // ptr to j-th neuron of the i-th layer of network n
- print_double_vector(np->w, np->num_weights);
+ print_float_vector(np->w, np->num_weights);
printf("\n");
}
printf("\n\n");
}
#endif
+ // free DPUs if UPMEM was deployed
+ if(upmem_initialized) {
+ free_dpus();
+ }
+
// memory cleanup before termination
- free_double_matrix(samples, NUM_TRAIN_SAMPLES);
- free_double_matrix(labels, NUM_TRAIN_SAMPLES);
+ free_float_matrix(samples, NUM_TRAIN_SAMPLES);
+ free_float_matrix(labels, NUM_TRAIN_SAMPLES);
free_network(n);
return 0;
diff --git a/src/read_image_data.c b/src/host/read_image_data.c
similarity index 100%
rename from src/read_image_data.c
rename to src/host/read_image_data.c
diff --git a/src/host/sse.c b/src/host/sse.c
new file mode 100644
index 0000000..cf58db9
--- /dev/null
+++ b/src/host/sse.c
@@ -0,0 +1,13 @@
+#include "mlp.h"
+
+float sse(float *real, float *ideal, int length)
+{
+ float sse = 0.0; // Sum of squared errors
+
+    for(size_t i=0; i<length; ++i)
+        sse += (real[i] - ideal[i]) * (real[i] - ideal[i]);
+
+    return sse;
+}
     n->num_weights = 0;
}
-void free_double_matrix(double **addr, int nrows)
+void free_float_matrix(float **addr, int nrows)
{
if(!addr)
return;
@@ -68,19 +68,19 @@ void free_uint8_matrix(uint8_t **addr, int nrows)
free(addr);
}
-void print_double_matrix(double **addr, int nrows, int ncols)
+void print_float_matrix(float **addr, int nrows, int ncols)
{
     for(size_t i=0; i<nrows; i++)
diff --git a/tests/test_accumulate_layer_gradients.c b/tests/test_accumulate_layer_gradients.c
--- a/tests/test_accumulate_layer_gradients.c
+++ b/tests/test_accumulate_layer_gradients.c
-        first_layer->inputs[i] = ((double) rand() / (double) RAND_MAX) * 20;
+ first_layer->inputs[i] = ((float) rand() / (float) RAND_MAX) * 20;
// deltas is a 1x4 identity matrix
for(int i=0; i<1*4; i++)
first_layer->deltas[i] = 1.0;
- double batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ float batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for(int i=0; i<4; i++)
for(int j=0; j<5; j++)
@@ -29,7 +29,7 @@ int test_accumulate_layer_gradients()
for(int i=0; i<4; i++)
for(int j=0; j<5; j++)
- test_pass_fail &= batch_dw_ideal[i][j] == first_layer->n[i].batch_dw[j];
+ test_pass_fail &= TEST_FLOAT_EQ(batch_dw_ideal[i][j], first_layer->n[i].batch_dw[j], EPS_TEST);
return test_pass_fail;
}
diff --git a/tests/test_activation.c b/tests/test_activation.c
index 8998218..2d120bb 100644
--- a/tests/test_activation.c
+++ b/tests/test_activation.c
@@ -1,12 +1,12 @@
#include "mlp.h"
#include "test.h"
-int test_activation(double x)
+int test_activation(float x)
{
- double activation_result = get_activation(x);
- double activation_derivative_result = get_activation_derivative(x);
+ float activation_result = get_activation(x);
+ float activation_derivative_result = get_activation_derivative(x);
- double expected_activation_derivative = 1 - pow(activation_result, 2);
+ float expected_activation_derivative = 1 - powf(activation_result, 2);
if(abs(activation_derivative_result - expected_activation_derivative) < 1e-5)
return 1;
diff --git a/tests/test_drand.c b/tests/test_drand.c
index 1411e08..2771bb1 100644
--- a/tests/test_drand.c
+++ b/tests/test_drand.c
@@ -7,7 +7,7 @@ int test_drand()
for(int i=0; i<10; i++)
{
- double test_value = drand();
+ float test_value = drand();
test_pass_fail &= (test_value >= 0.0) && (test_value <= 1.0);
}
diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c
index b8a06c6..97e63c1 100644
--- a/tests/test_get_delta.c
+++ b/tests/test_get_delta.c
@@ -6,8 +6,8 @@ int test_get_delta()
int test_pass_fail = 1;
int num_neurons_per_layers[] = {3, 3};
- double samples[] = {1, 1, 1, 1};
- double ideals[] = {3, 3, 3, 3};
+ float samples[] = {1, 1, 1, 1};
+ float ideals[] = {3, 3, 3, 3};
NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
@@ -28,20 +28,24 @@ int test_get_delta()
// test last layer delta
- double *d_last_layer = get_delta(n, samples, ideals, 1);
+ float *d_last_layer = get_delta(n, samples, ideals, 1);
for(int i=0; i<3; i++)
{
- test_pass_fail &= (d_last_layer[i] == (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]));
+ test_pass_fail &= TEST_FLOAT_EQ(d_last_layer[i],
+ (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]),
+ EPS_TEST);
}
// test before-last layer delta
- double *d_first_layer = get_delta(n, samples, ideals, 0);
+ float *d_first_layer = get_delta(n, samples, ideals, 0);
for(int i=0; i<3; i++)
{
- test_pass_fail &= (d_first_layer[i] == (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]));
+ test_pass_fail &= TEST_FLOAT_EQ(d_first_layer[i],
+ (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]),
+ EPS_TEST);
}
return test_pass_fail;
diff --git a/tests/test_get_y.c b/tests/test_get_y.c
index 4274682..30206bd 100644
--- a/tests/test_get_y.c
+++ b/tests/test_get_y.c
@@ -4,7 +4,7 @@
int test_get_y()
{
int num_neurons_per_layers[] = {3, 3};
- double samples[] = {1, 1, 1, 1};
+ float samples[] = {1, 1, 1, 1};
NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
@@ -23,24 +23,24 @@ int test_get_y()
n->l[0].n[2].w[2] = 0.0;
n->l[0].n[2].w[3] = 0.0;
- double *y = get_y(n, 0, samples);
- double *z = get_z(n, 0, samples);
+ float *y = get_y(n, 0, samples);
+ float *z = get_z(n, 0, samples);
- // printf("y[0] == %.2lf\n", y[0]);
- // printf("y[1] == %.2lf\n", y[1]);
- // printf("y[2] == %.2lf\n", y[2]);
+ // printf("y[0] == %.2f\n", y[0]);
+ // printf("y[1] == %.2f\n", y[1]);
+ // printf("y[2] == %.2f\n", y[2]);
- int test_pass_fail = (y[0] == 1)
- && (y[1] == get_activation(z[0]))
- && (y[2] == get_activation(z[1]));
+ int test_pass_fail = TEST_FLOAT_EQ(y[0], 1, EPS_TEST)
+ && TEST_FLOAT_EQ(y[1], get_activation(z[0]), EPS_TEST)
+ && TEST_FLOAT_EQ(y[2], get_activation(z[1]), EPS_TEST);
y = get_y(n, 1, samples);
z = get_z(n, 1, samples);
test_pass_fail = test_pass_fail
- && (y[0] == get_activation(z[0]))
- && (y[1] == get_activation(z[1]))
- && (y[2] == get_activation(z[2]));
+ && TEST_FLOAT_EQ(y[0], get_activation(z[0]), EPS_TEST)
+ && TEST_FLOAT_EQ(y[1], get_activation(z[1]), EPS_TEST)
+ && TEST_FLOAT_EQ(y[2], get_activation(z[2]), EPS_TEST);
return test_pass_fail;
}
diff --git a/tests/test_get_z.c b/tests/test_get_z.c
index be921c6..4367604 100644
--- a/tests/test_get_z.c
+++ b/tests/test_get_z.c
@@ -4,7 +4,7 @@
int test_get_z()
{
int num_neurons_per_layers[] = {3, 3};
- double samples[] = {1, 1, 1, 1};
+ float samples[] = {1, 1, 1, 1};
NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
@@ -23,13 +23,13 @@ int test_get_z()
n->l[0].n[2].w[2] = 0.0;
n->l[0].n[2].w[3] = 0.0;
- double *z = get_z(n, 0, samples);
+ float *z = get_z(n, 0, samples);
- // printf("z[0] == %.2lf\n", z[0]);
- // printf("z[1] == %.2lf\n", z[1]);
- // printf("z[2] == %.2lf\n", z[2]);
+ // printf("z[0] == %.2f\n", z[0]);
+ // printf("z[1] == %.2f\n", z[1]);
+ // printf("z[2] == %.2f\n", z[2]);
- int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1);
+ int test_pass_fail = TEST_FLOAT_EQ(z[0], 2, EPS_TEST) && TEST_FLOAT_EQ(z[1], 6, EPS_TEST) && TEST_FLOAT_EQ(z[2], -1, EPS_TEST);
return test_pass_fail;
}
diff --git a/tests/test_init_layer.c b/tests/test_init_layer.c
index 13b7ae6..761f331 100644
--- a/tests/test_init_layer.c
+++ b/tests/test_init_layer.c
@@ -6,10 +6,10 @@ int test_init_layer()
LAYER *l = init_layer(3, 4, BATCH_SIZE);
// printf("%d\n", l->num_neurons);
- // printf("%lf\n", l->n[0].w[0]);
- // printf("%lf\n", l->n[1].w[0]);
- // printf("%lf\n", l->n[2].w[0]);
- // printf("%lf\n", l->n[0].lw[0]);
+ // printf("%f\n", l->n[0].w[0]);
+ // printf("%f\n", l->n[1].w[0]);
+ // printf("%f\n", l->n[2].w[0]);
+ // printf("%f\n", l->n[0].lw[0]);
// printf("%d\n", l->n[0].num_weights);
// printf("%d\n", l->n[1].num_weights);
// printf("%d\n", l->n[2].num_weights);
diff --git a/tests/test_init_network.c b/tests/test_init_network.c
index 375565f..2e5603b 100644
--- a/tests/test_init_network.c
+++ b/tests/test_init_network.c
@@ -12,7 +12,7 @@ int test_init_network()
// printf("%d\n", n->l[1].num_neurons);
// printf("%d\n", n->l[2].num_neurons);
// printf("%d\n", n->l[0].n[0].num_weights);
- // printf("%lf\n", n->l[0].n[0].lw[0]);
+ // printf("%f\n", n->l[0].n[0].lw[0]);
// printf("%d\n", n->l[1].n[0].num_weights);
// printf("%d\n", n->l[2].n[0].num_weights);
diff --git a/tests/test_init_neuron.c b/tests/test_init_neuron.c
index 61d0232..486548c 100644
--- a/tests/test_init_neuron.c
+++ b/tests/test_init_neuron.c
@@ -6,8 +6,8 @@ int test_init_neuron()
NEURON *n = init_neuron(2);
// printf("%d\n", n->num_weights);
- // printf("%lf\n", n->w[0]);
- // printf("%lf\n", n->lw[0]);
+ // printf("%f\n", n->w[0]);
+ // printf("%f\n", n->lw[0]);
return (n->num_weights == 2) && (n->w[0] <= 1) && (n->w[0] >= -1) && (n->lw[0] == n->w[0]);
}
diff --git a/tests/test_matrix.c b/tests/test_matrix.c
index 25323e9..37e9308 100644
--- a/tests/test_matrix.c
+++ b/tests/test_matrix.c
@@ -1,31 +1,43 @@
#include "mlp.h"
#include "test.h"
+#include "upmem.h"
int test_multiply_matrix()
{
int test_result_pass_fail = 1;
- double matrixA[2*3] = {1.0, 2.0, 3.0,
+ float matrixA[2*3] = {1.0, 2.0, 3.0,
0.0, 5.0, 6.0};
- double matrixB[3*2] = {2.0, 6.0,
+ float matrixB[3*2] = {2.0, 6.0,
3.0, 3.0,
4.0, 0.0};
- // result matrix (initialized with random double values [0.0, 20.0])
- double matrixC[2*2];
+ // result matrices (initialized with random float values [0.0, 20.0])
+ float matrixC[2*2];
+ float matrixD[2*2];
for(int i=0; i<2*2; i++) {
- matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20;
+ matrixC[i] = ((float)rand() / (float)RAND_MAX) * 20;
+ matrixD[i] = ((float)rand() / (float)RAND_MAX) * 20;
}
// ideal result
- double matrixR[2*2] = {20.0, 12.0,
+ float matrixR[2*2] = {20.0, 12.0,
39.0, 15.0};
- multiply_matrix(matrixA, matrixB, matrixC, 2, 3, 2);
+ multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2);
+
+ init_dpus();
+ multiply_matrix_upmem(matrixA, matrixB, matrixD, 2, 3, 2);
+ free_dpus();
+
+ for(int i=0; i<2*2; i++) {
+ printf("%f ", matrixC[i]);
+ }
for(int i=0; i<2*2; i++) {
- test_result_pass_fail |= matrixC[i] == matrixR[i];
+ test_result_pass_fail &= matrixC[i] == matrixR[i];
+ test_result_pass_fail &= matrixC[i] == matrixD[i];
}
return test_result_pass_fail;
@@ -35,17 +47,17 @@ int test_transpose_matrix()
{
int test_result_pass_fail = 1;
- double matrixA[2*3] = {1.0, 2.0, 3.0,
+ float matrixA[2*3] = {1.0, 2.0, 3.0,
0.0, 5.0, 6.0};
- // result matrix (initialized with random double values [0.0, 20.0])
- double matrixT[3*2];
+ // result matrix (initialized with random float values [0.0, 20.0])
+ float matrixT[3*2];
for(int i=0; i<3*2; i++) {
- matrixT[i] = ((double)rand() / (double)RAND_MAX) * 20;
+ matrixT[i] = ((float)rand() / (float)RAND_MAX) * 20;
}
// ideal result
- double matrixR[3*2] = {1.0, 0.0,
+ float matrixR[3*2] = {1.0, 0.0,
2.0, 5.0,
3.0, 6.0};
diff --git a/tests/test_sse.c b/tests/test_sse.c
index 8f660c0..732258a 100644
--- a/tests/test_sse.c
+++ b/tests/test_sse.c
@@ -3,26 +3,26 @@
int test_sse()
{
- double real[] = {3, 4, 4, 4};
- double ideal[] = {4, 4, 4, 4};
+ float real[] = {3, 4, 4, 4};
+ float ideal[] = {4, 4, 4, 4};
int test_pass_fail = 1;
- double sse_result = sse(real, ideal, 4);
+ float sse_result = sse(real, ideal, 4);
- test_pass_fail = test_pass_fail && (sse_result == 1);
+ test_pass_fail &= TEST_FLOAT_EQ(sse_result, 1, EPS_TEST);
real[0] = 4;
sse_result = sse(real, ideal, 4);
- test_pass_fail = test_pass_fail && (sse_result == 0);
+ test_pass_fail &= TEST_FLOAT_EQ(sse_result, 0, EPS_TEST);
real[0] = 6;
sse_result = sse(real, ideal, 4);
- test_pass_fail = test_pass_fail && (sse_result == 4);
+ test_pass_fail &= TEST_FLOAT_EQ(sse_result, 4, EPS_TEST);
real[0] = 6;
real[1] = 2;
sse_result = sse(real, ideal, 4);
- test_pass_fail = test_pass_fail && (sse_result == 8);
+ test_pass_fail &= TEST_FLOAT_EQ(sse_result, 8, EPS_TEST);
return test_pass_fail;
}