diff --git a/.github/build_upmem_toolchain.sh b/.github/build_upmem_toolchain.sh
new file mode 100644
index 0000000..cd157ec
--- /dev/null
+++ b/.github/build_upmem_toolchain.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /opt/
+git clone https://github.com/kagandikmen/upmem-sdk.git
+tar -xvf upmem-sdk/2024.2.0/upmem-2024.2.0-Linux-x86_64.tar.gz
+mv upmem-2024.2.0-Linux-x86_64/ /usr/local/bin/
+rm -rf upmem-sdk/
\ No newline at end of file
diff --git a/.github/workflows/valgrind.yaml b/.github/workflows/memory_leak_tests.yaml
similarity index 50%
rename from .github/workflows/valgrind.yaml
rename to .github/workflows/memory_leak_tests.yaml
index 69cfa97..e6a8ba3 100644
--- a/.github/workflows/valgrind.yaml
+++ b/.github/workflows/memory_leak_tests.yaml
@@ -1,4 +1,4 @@
-name: Valgrind
+name: Memory Leak Tests
 
 on:
   push:
@@ -6,7 +6,7 @@ on:
 
 jobs:
   memcheck:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Checkout repository
@@ -19,25 +19,40 @@ jobs:
           sudo apt update
          sudo apt install -y build-essential valgrind
           pip3 install numpy
+          sudo bash .github/build_upmem_toolchain.sh
 
       - name: Extract training samples & labels
         run: python3 read_dataset.py
 
-      - name: Compile MLP
-        run: gcc -g -DEPSILON=0.5 -DNUM_TRAIN_SAMPLES=2 -Iinclude src/*.c -o mlp -lm
+      - name: Compile MLP without sanitizer or UPMEM
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          make SAN=0 UPMEM=0
 
       - name: Run Valgrind
         run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
           valgrind --leak-check=full \
                    --show-leak-kinds=all \
                    --track-origins=yes \
                    --error-exitcode=1 \
                    --log-file=valgrind.txt \
-                   ./mlp > /dev/null
+                   ./build/mlp > /dev/null
 
       - name: Save Valgrind log
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: valgrind_log
-          path: valgrind.txt
\ No newline at end of file
+          path: valgrind.txt
+
+      - name: Compile MLP with sanitizer and UPMEM
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          make clean
+          make SAN=1 UPMEM=1
+
+      - name: Run with sanitizer
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh simulator
+          ./build/mlp > /dev/null
\ No newline at end of file
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index cba3dcc..897d053 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build-and-test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Checkout repository
@@ -15,19 +15,27 @@
         with:
           submodules: 'recursive'
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt install -y build-essential
+        run: |
+          sudo apt update && sudo apt install -y build-essential python3.10 python3.10-dev
+          sudo bash .github/build_upmem_toolchain.sh
 
       - name: Create build directory
         run: mkdir build
 
       - name: Run CMake
         working-directory: build
-        run: cmake ..
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          cmake ..
 
       - name: Build
         working-directory: build
-        run: make
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          make
 
       - name: Run the tests
         working-directory: build
-        run: make test
\ No newline at end of file
+        run: |
+          source /usr/local/bin/upmem-2024.2.0-Linux-x86_64/upmem_env.sh
+          make test
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 24a870c..274d4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 matmul.c
 matrices.h
-dpu/
 *.o
 *.out
 training_images.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d408d10..7f33b08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,18 +6,44 @@
 set(CMAKE_C_STANDARD_REQUIRED ON)
 
 include_directories(include)
 
-file(GLOB SRC_FILES src/*.c)
-list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/mlp.c")
+file(GLOB SRC_FILES src/host/*.c)
+list(REMOVE_ITEM SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/host/main.c")
 
 file(GLOB TEST_FILES tests/*.c)
 
+execute_process(
+    COMMAND dpu-pkg-config --cflags dpu
+    OUTPUT_VARIABLE DPU_C_FLAGS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+    COMMAND dpu-pkg-config --libs dpu
+    OUTPUT_VARIABLE DPU_LIBS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
 enable_testing()
 
+add_custom_target(build_dpu_program ALL
+    COMMAND dpu-upmem-dpurte-clang
+            -I${CMAKE_SOURCE_DIR}/include
+            -o ${CMAKE_BINARY_DIR}/dpu_program
+            ${CMAKE_SOURCE_DIR}/src/dpu/dpu_program.c
+)
+
+add_compile_definitions(
+    # NUM_DPU=1 is intentionally left commented out: a macro override here would not
+    # reach the dpu-upmem-dpurte-clang invocation above, so the host build would disagree
+    # with dpu_program.c. Avoid overriding dimension macros set in the header files here.
+    DPU_BINARY_PATH=\"./dpu_program\"
+)
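+
+# Hypothetical example, not part of the current build: if NUM_DPU ever has to be
+# overridden, pass the same value to both compilers so host and DPU code agree, e.g.
+#     add_compile_definitions(NUM_DPU=8)
+#     ... COMMAND dpu-upmem-dpurte-clang -DNUM_DPU=8 ...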
 
 foreach(TEST_SRC ${TEST_FILES})
     get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
     add_executable(${TEST_NAME} ${TEST_SRC} ${SRC_FILES})
     target_include_directories(${TEST_NAME} PRIVATE include)
-    target_link_libraries(${TEST_NAME} m)
+    target_compile_options(${TEST_NAME} PRIVATE ${DPU_C_FLAGS})
+    target_link_libraries(${TEST_NAME} PRIVATE m ${DPU_LIBS})
     add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
 endforeach()
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 9c5cb69..e2f8548 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,34 @@
-CLANG = dpu-upmem-dpurte-clang
-SOURCE = matmul
-CFLAGS += -O0 -DNR_TASKLETS=6
-FILESTODELETE = matmul.c dpu/
-
-all:
-	python3 generate.py && \
-	for test in $$(seq 0 15); do \
-		$(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
-	done
-	gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+DPU_UPMEM_CLANG = dpu-upmem-dpurte-clang
+DPU_UPMEM_CFLAGS += -DNR_TASKLETS=16
 
-clean:
-	rm -rf *.o ${FILESTODELETE}
+BATCH_SIZE ?= 20
+MAX_EPOCH ?= 10
+NUM_TRAIN_SAMPLES ?= 200
+
+CFLAGS += -std=c99 -Iinclude -D_GNU_SOURCE -DVERBOSE -DDEBUG
+CFLAGS += -DBATCH_SIZE=$(BATCH_SIZE) -DMAX_EPOCH=$(MAX_EPOCH) -DNUM_TRAIN_SAMPLES=$(NUM_TRAIN_SAMPLES)
+
+BUILD_DIR = build/
+
+UPMEM ?= 1
+ifeq ($(UPMEM), 1)
+    CFLAGS += -DUPMEM
+endif
 
-clean_all:
-	rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
+SAN ?= 0
+ifeq ($(SAN), 1)
+    CFLAGS += -fsanitize=address,undefined,leak -fno-omit-frame-pointer -g
+endif
+
+EVAL ?= 0
+ifeq ($(EVAL), 1)
+    CFLAGS += -DEVAL
+endif
+
+all: clean
+	mkdir $(BUILD_DIR); \
+	$(DPU_UPMEM_CLANG) $(DPU_UPMEM_CFLAGS) -Iinclude -o build/dpu_program src/dpu/dpu_program.c; \
+	gcc src/host/*.c $(CFLAGS) -o build/mlp -lm `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+	rm -rf $(BUILD_DIR)
diff --git a/README.md b/README.md
index a4acb69..ae97435 100644
--- a/README.md
+++ b/README.md
@@ -1,99 +1,108 @@
 # UPMEM-MLP
 
-UPMEM-MLP is an attempt at implementing a multilayer perceptron application in pure C and accelerating this application on the UPMEM platform.
+UPMEM-MLP implements a multilayer perceptron training application in C and accelerates it on the UPMEM platform.
 
-[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Valgrind](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/valgrind.yaml)
+[![Unit Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/unit_tests.yaml) [![Memory Leak Tests](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml/badge.svg)](https://github.com/OpenHardware-Initiative/UPMEM-MLP/actions/workflows/memory_leak_tests.yaml)
 
-## Requirements
+## Prerequisites
 
-- GCC or Clang
 - CMake 3.10 or higher
+- GCC
+- Python
 - UPMEM SDK
 
-### Installing UPMEM SDK
+<details>
+<summary>Installing UPMEM SDK</summary>
 
-To set up the UPMEM SDK on your system:
+1. Download the UPMEM SDK tarball for your system from [this link](https://github.com/kagandikmen/upmem-sdk)
 
-1. Download UPMEM SDK tarball for your system from [this link](https://sdk.upmem.com/)
+> **NOTICE:** The UPMEM SDK is no longer downloadable from UPMEM's official SDK [Downloads](https://sdk.upmem.com) page.
 
 2. Extract its content and (preferably) move it to a better place like `/usr/local/bin/`
 
-3. Add the shell script `upmem_env.sh`, which sets necessary environment variables, to be sourced into your `.bashrc` as in:
+3. Add the shell script `upmem_env.sh`, which sets the necessary environment variables, to be sourced from your `.bashrc`:
 
 ```bash
-source /usr/local/bin/upmem-sdk/upmem_env.sh > /dev/null
+source /usr/local/bin/upmem-sdk/upmem_env.sh simulator > /dev/null
 ```
 
 4. Restart your shell session for the changes to become effective
 
-5. Test your setup using:
+5. Test your setup:
 
 ```bash
 which dpu-lldb
 ```
+
+---
+
+</details>
 
-which should, if correctly installed, return the path to the LLDB Debugger binary of UPMEM SDK
+## Getting Started
 
-## Running the Unit Tests
-
-To run the CMake test flow:
+1. Clone this repository and navigate inside it:
 
 ```bash
-mkdir build
-cd build
-cmake ..
-make
-make test
+git clone https://github.com/OpenHardware-Initiative/UPMEM-MLP.git
+cd UPMEM-MLP
 ```
 
-## Compiling the Multilayer Perceptron Natively
-
-To natively run the C multilayer perceptron on your system:
-
-1. Create a Python virtual environment (optional, but recommended) and install requirements:
+2. **(Optional, but recommended)** Create a Python virtual environment:
 
 ```bash
 python3 -m venv venv
 source venv/bin/activate
+```
+
+3. Install Python requirements:
+
+```bash
 pip install -r requirements.txt
 ```
 
-2. Extract training samples & labels:
+4. Extract training samples & labels:
 
 ```bash
 python3 read_dataset.py
 ```
 
-3. Compile the application:
+5. Compile the MLP:
 
 ```bash
-gcc -Iinclude src/*.c -o mlp -lm
+make
+```
+
+6. Run the MLP:
+
+```bash
+./build/mlp
 ```
 
-With this command, you can use:
+With the `make` command in step 5, you can use:
 
-- `-DVERBOSE` for the verbose mode, which prints loss deltas for all epochs
-- `-DDEBUG` for the debug mode, which prints a couple samples & labels at the beginning and all weights at the end
-- `-DBATCH_SIZE=...` to configure the batch size used during training
-- `-DMAX_EPOCH=...` to configure the maximum number of epochs the training can run for
-- `-DEPSILON=...` to configure epsilon from the command line
-- `-DLEARNING_RATE=...` to configure learning rate from the command line
-- `-DDECAY_RATE=...` to configure the decay rate of the learning rate
-- `-DMOMENTUM=...` to configure momentum from the command line
-- `-DNUM_TRAIN_SAMPLES=...` to configure from the command line how many samples the model should be trained with
-- `-DTRAINING_SAMPLES_FILE=...` to configure the path to the text file samples should be sourced from
-- `-DTRAINING_LABELS_FILE=...` to configure the path to the text file labels should be sourced from
+- `BATCH_SIZE=...` to configure the batch size used during training, which otherwise defaults to 20
+- `MAX_EPOCH=...` to configure the maximum number of epochs the training can run for, which otherwise defaults to 10
+- `NUM_TRAIN_SAMPLES=...` to configure how many samples the model is trained with, which otherwise defaults to 200
+- `UPMEM=0` to turn off matrix multiplication on UPMEM
+- `SAN=1` to build the MLP with the GCC sanitizers enabled
+- `EVAL=1` to run the MLP in evaluation mode, which adds the number of cycles spent in training to the printout
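+
+For example, a sanitizer-instrumented, host-only build that trains on 1000 samples would be produced with (an illustrative invocation, using only the knobs listed above):
+
+```bash
+make SAN=1 UPMEM=0 NUM_TRAIN_SAMPLES=1000
+```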
 
-## Status
+## Running the Unit Tests
+
+UPMEM-MLP comes with unit tests, which can be found in `tests/`. Run these unit tests using:
 
-UPMEM-MLP is a work in progress as of 2025-11-14.
+```bash
+mkdir build
+cd build
+cmake ..
+make
+make test
+```
 
-### To-Do
+## Status
 
-- [ ] Adapt `multiply_matrix` for in-memory matrix multiplication on UPMEM
+UPMEM-MLP is complete and actively maintained as of 2025-11-23.
 
 ## License
 
 UPMEM-MLP is licensed under the Apache License v2.0. See [LICENSE](LICENSE) for more details.
 
----
\ No newline at end of file
+---
diff --git a/benchmarks.md b/benchmarks.md
new file mode 100644
index 0000000..ce5ca11
--- /dev/null
+++ b/benchmarks.md
@@ -0,0 +1,10 @@
+# Benchmark Results
+
+## NN Layout: NUM_FEATURES -> 4096 -> 4096 -> 2048 -> NUM_LABELS
+
+| BATCH_SIZE | NUM_TRAIN_SAMPLES | MAX_EPOCH | Cycles (Intel 64 Host) | Cycles (Intel 64 Host + UPMEM) |
+|------------|-------------------|-----------|------------------------|--------------------------------|
+| 1200       | 3600              | 1         | 13.05T                 | 12.73T                         |
+| 3600       | 10800             | 1         | 42.38T                 | 39.49T                         |
+
+---
diff --git a/include/mlp.h b/include/mlp.h
index b2a6616..081cea9 100644
--- a/include/mlp.h
+++ b/include/mlp.h
@@ -35,13 +35,13 @@ extern unsigned int rseed;
 
 typedef struct {
     int num_weights;
-    double *w, *lw;
-    double *batch_dw;
+    float *w, *lw;
+    float *batch_dw;
 } NEURON;
 
 typedef struct {
     int num_neurons;
-    double *inputs, *deltas;
+    float *inputs, *deltas;
     NEURON *n;
 } LAYER;
 
@@ -50,22 +50,23 @@ typedef struct {
     LAYER *l;
 } NETWORK;
 
-void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate);
+void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate);
 void apply_gradients(NETWORK *n, int batch_size);
-double drand();
-double get_activation(double x);
-double get_activation_derivative(double x);
-double *get_delta(NETWORK *n, double *samples, double *ideal, int layer_index);
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples);
-double *get_y(NETWORK *n, int layer_index, double *sample);
-double *get_z(NETWORK *n, int layer_index, double *sample);
+float drand();
+float get_activation(float x);
+float get_activation_derivative(float x);
+float *get_delta(NETWORK *n, float *samples, float *ideal, int layer_index);
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples);
+float *get_y(NETWORK *n, int layer_index, float *sample);
+float *get_z(NETWORK *n, int layer_index, float *sample);
 LAYER *init_layer(int num_neurons, int num_weights_per_neuron, int batch_size);
 NETWORK *init_network(int num_inputs, int num_layers, int *num_inputs_per_layer, int batch_size);
 NEURON *init_neuron(int num_weights);
-void multiply_matrix(const double *A, const double *B, double *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
 uint8_t **read_image_data(const char *filename, int *num_rows, const int num_cols);
-double sse(double *real, double *ideal, int length);
-void transpose_matrix(const double *A, double *C, int rows, int cols);
+float sse(float *real, float *ideal, int length);
+void transpose_matrix(const float *A, float *C, int rows, int cols);
 
 //
 // utility functions
@@ -75,10 +76,10 @@
 void free_layer(LAYER *l);
 void free_network(NETWORK *n);
 void free_neuron(NEURON *n);
-void free_double_matrix(double **addr, int nrows);
+void free_float_matrix(float **addr, int nrows);
 void free_uint8_matrix(uint8_t **addr, int nrows);
-void print_double_matrix(double **addr, int nrows, int ncols);
-void print_double_vector(double *addr, int nrows);
+void print_float_matrix(float **addr, int nrows, int ncols);
+void print_float_vector(float *addr, int nrows);
 
 #endif
diff --git a/include/test.h b/include/test.h
index 5cf9797..880f862 100644
--- a/include/test.h
+++ b/include/test.h
@@ -11,4 +11,8 @@
         if(test_result == 0) \
             printf("PASS\n"); \
     return 0; \
 
+#define TEST_FLOAT_EQ(v1, v2, eps) (fabsf((v1) - (v2)) < (eps))
+
+#define EPS_TEST 1e-5
+
 #endif
\ No newline at end of file
diff --git a/include/upmem.h b/include/upmem.h
new file mode 100644
index 0000000..78bf601
--- /dev/null
+++ b/include/upmem.h
@@ -0,0 +1,33 @@
+#ifndef UPMEM_H
+#define UPMEM_H
+
+#include <stdint.h>
+
+#ifndef DPU_BINARY_PATH
+#define DPU_BINARY_PATH "build/dpu_program"
+#endif
+
+#ifndef NUM_DPU
+#define NUM_DPU 32
+#endif
+
+#ifndef TILE_SIZE
+#define TILE_SIZE 512
+#endif
+
+#define EVAL_DPU_CC 458000000
+
+typedef struct {
+    uint32_t rows_a;
+    uint32_t cols_a;
+    uint32_t cols_b;
+} dpu_args_t;
+
+extern int upmem_initialized;
+
+void free_dpus();
+void init_dpus();
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
+void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b);
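+
+// Typical host-side call sequence (sketch, mirroring tests/test_matrix.c):
+//     init_dpus();                                    // allocate NUM_DPU DPUs, load DPU_BINARY_PATH
+//     multiply_matrix_upmem(A, B, C, rows, k, cols);  // C = A * B, tiled across the DPU set
+//     free_dpus();                                    // release the DPU set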
+
+#endif
diff --git a/legacy/Makefile b/legacy/Makefile
new file mode 100644
index 0000000..9c5cb69
--- /dev/null
+++ b/legacy/Makefile
@@ -0,0 +1,17 @@
+CLANG = dpu-upmem-dpurte-clang
+SOURCE = matmul
+CFLAGS += -O0 -DNR_TASKLETS=6
+FILESTODELETE = matmul.c dpu/
+
+all:
+	python3 generate.py && \
+	for test in $$(seq 0 15); do \
+		$(CLANG) $(CFLAGS) -o dpu/dpu$$test/${SOURCE}.o dpu/dpu$$test/${SOURCE}.c; \
+	done
+	gcc --std=c99 host.c -o host.o `dpu-pkg-config --cflags --libs dpu`
+
+clean:
+	rm -rf *.o ${FILESTODELETE}
+
+clean_all:
+	rm -rf *.o .vscode/ .cache/ .__pycache__/ training_images.txt training_labels.txt
\ No newline at end of file
diff --git a/generate.py b/legacy/generate.py
similarity index 100%
rename from generate.py
rename to legacy/generate.py
diff --git a/host.c b/legacy/host.c
similarity index 100%
rename from host.c
rename to legacy/host.c
diff --git a/matmul.template b/legacy/matmul.template
similarity index 100%
rename from matmul.template
rename to legacy/matmul.template
diff --git a/matrices.template b/legacy/matrices.template
similarity index 100%
rename from matrices.template
rename to legacy/matrices.template
diff --git a/src/activation.c b/src/activation.c
deleted file mode 100644
index eeaaee7..0000000
--- a/src/activation.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "mlp.h"
-
-double get_activation(double x)
-{
-    return tanh(x);
-}
-
-double get_activation_derivative(double x)
-{
-    return 1.0 / pow(cosh(x), 2);
-}
\ No newline at end of file
diff --git a/src/dpu/dpu_program.c b/src/dpu/dpu_program.c
new file mode 100644
index 0000000..cea4413
--- /dev/null
+++ b/src/dpu/dpu_program.c
@@ -0,0 +1,46 @@
+#include <defs.h>
+#include <mram.h>
+#include <perfcounter.h>
+#include <stdio.h>
+#include "upmem.h"
+
+__mram_noinit float A_chunk[TILE_SIZE * TILE_SIZE];
+__mram_noinit float B_whole[TILE_SIZE * TILE_SIZE];
+__mram_noinit float C_chunk[TILE_SIZE * TILE_SIZE];
+
+__host dpu_args_t DPU_INPUT_ARGS;
+
+int main()
+{
+    perfcounter_config(COUNT_CYCLES, false);
+
+    dpu_args_t dpu_input_args = DPU_INPUT_ARGS;
+    uint32_t rows_a = dpu_input_args.rows_a;
+    uint32_t cols_a = dpu_input_args.cols_a;
+    uint32_t cols_b = dpu_input_args.cols_b;
+
+    if(!rows_a)
+        return 0;
+
+    perfcounter_t cc_start = perfcounter_get();
+
+    int chunk = rows_a / NR_TASKLETS;
+    int row_start = chunk * me();
+
+    // each tasklet computes its own block of rows of C_chunk = A_chunk * B_whole
+    for(int i=row_start; i<(row_start+chunk); ++i) {
+        for(int j=0; j<cols_b; ++j) {
+            float sum = 0;
+            for(int k=0; k<cols_a; ++k) {
+                sum += A_chunk[i*cols_a + k] * B_whole[k*cols_b + j];
+            }
+            C_chunk[i*cols_b + j] = sum;
+        }
+    }
+
+    perfcounter_t cc_end = perfcounter_get();
+
+    if(me() == 0)
+        printf("DPU cycles: %lu\n", (unsigned long)(cc_end - cc_start));
+
+    return 0;
+}
diff --git a/src/accumulate_layer_gradients.c b/src/host/accumulate_layer_gradients.c
rename from src/accumulate_layer_gradients.c
rename to src/host/accumulate_layer_gradients.c
--- a/src/accumulate_layer_gradients.c
+++ b/src/host/accumulate_layer_gradients.c
@@ ... @@
-void accumulate_layer_gradients(LAYER *l, int batch_size, double learning_rate)
+void accumulate_layer_gradients(LAYER *l, int batch_size, float learning_rate)
 {
     int num_neurons = l->num_neurons;
     int num_weights = l->n->num_weights;
 
-    double *gradient = (double *) malloc (num_neurons * num_weights * sizeof(double));
+    float *gradient = (float *) malloc (num_neurons * num_weights * sizeof(float));
     if(!gradient) {
         return;
     }
 
-    double *deltas_T = (double*) malloc (num_neurons * batch_size * sizeof(double));
+    float *deltas_T = (float*) malloc (num_neurons * batch_size * sizeof(float));
     if(!deltas_T) {
         free(gradient);
         return;
diff --git a/src/host/activation.c b/src/host/activation.c
new file mode 100644
index 0000000..5345ec6
--- /dev/null
+++ b/src/host/activation.c
@@ -0,0 +1,11 @@
+#include "mlp.h"
+
+float get_activation(float x)
+{
+    return tanhf(x);
+}
+
+float get_activation_derivative(float x)
+{
+    return 1.0 / powf(coshf(x), 2);
+}
\ No newline at end of file
diff --git a/src/apply_gradients.c b/src/host/apply_gradients.c
similarity index 71%
rename from src/apply_gradients.c
rename to src/host/apply_gradients.c
index 4bc143b..ede95e7 100644
--- a/src/apply_gradients.c
+++ b/src/host/apply_gradients.c
@@ -15,11 +15,11 @@
 
         for(int k=0; k<np->num_weights; k++)    // do the following for all weights "k" of said neuron:
         {
-            double previous_weight_update = np->w[k] - np->lw[k];
-            double momentum_term = MOMENTUM * previous_weight_update;
-            double gradient_term = np->batch_dw[k] / (double) batch_size;
+            float previous_weight_update = np->w[k] - np->lw[k];
+            float momentum_term = MOMENTUM * previous_weight_update;
+            float gradient_term = np->batch_dw[k] / (float) batch_size;
 
-            double old_weight = np->w[k];
+            float old_weight = np->w[k];
             np->lw[k] = old_weight;
 
             np->w[k] = old_weight + gradient_term + momentum_term;
diff --git a/src/host/dpu_host.c b/src/host/dpu_host.c
new file mode 100644
index 0000000..17bc719
--- /dev/null
+++ b/src/host/dpu_host.c
@@ -0,0 +1,149 @@
+#include <dpu.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "upmem.h"
+
+struct dpu_set_t dpus, dpu;
+int upmem_initialized = 0;
+
+void free_dpus()
+{
+    DPU_ASSERT(dpu_free(dpus));
+}
+
+void init_dpus()
+{
+    if(!upmem_initialized) {
+        DPU_ASSERT(dpu_alloc(NUM_DPU, NULL, &dpus));
+        DPU_ASSERT(dpu_load(dpus, DPU_BINARY_PATH, NULL));
+
+        upmem_initialized = 1;
+    }
+}
+
+void multiply_matrix_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    float tileA[TILE_SIZE][TILE_SIZE];
+    float tileB[TILE_SIZE][TILE_SIZE];
+    float tileC[TILE_SIZE][TILE_SIZE];
+
+    // walk the output in TILE_SIZE x TILE_SIZE blocks; edge tiles are zero-padded
+    for(int i=0; i<rows_a; i+=TILE_SIZE) {
+        int tile_rows = (rows_a - i > TILE_SIZE) ? TILE_SIZE : (rows_a - i);
+
+        for(int j=0; j<cols_b; j+=TILE_SIZE) {
+            int tile_cols = (cols_b - j > TILE_SIZE) ? TILE_SIZE : (cols_b - j);
+
+            for(int r=0; r<tile_rows; r++)
+                for(int c=0; c<tile_cols; c++)
+                    C[(i+r)*cols_b + (j+c)] = 0;
+
+            for(int k=0; k<cols_a; k+=TILE_SIZE) {
+                int tile_depth = (cols_a - k > TILE_SIZE) ? TILE_SIZE : (cols_a - k);
+
+                memset(tileA, 0, sizeof(tileA));
+                memset(tileB, 0, sizeof(tileB));
+
+                for(int r=0; r<tile_rows; r++)
+                    for(int c=0; c<tile_depth; c++)
+                        tileA[r][c] = A[(i+r)*cols_a + (k+c)];
+
+                for(int r=0; r<tile_depth; r++)
+                    for(int c=0; c<tile_cols; c++)
+                        tileB[r][c] = B[(k+r)*cols_b + (j+c)];
+
+                process_tile_upmem(&tileA[0][0], &tileB[0][0], &tileC[0][0], TILE_SIZE, TILE_SIZE, TILE_SIZE);
+
+                // accumulate the partial tile product into C
+                for(int r=0; r<tile_rows; r++)
+                    for(int c=0; c<tile_cols; c++)
+                        C[(i+r)*cols_b + (j+c)] += tileC[r][c];
+            }
+        }
+    }
+}
+
+void process_tile_upmem(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    uint32_t each_dpu;
+    uint32_t dpu_rows_a_max = (rows_a + NUM_DPU - 1) / NUM_DPU;
+
+    DPU_FOREACH(dpus, dpu, each_dpu) {
+        uint32_t row_start = each_dpu * dpu_rows_a_max;
+        uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0
+                                   : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+                                   : dpu_rows_a_max;
+
+        dpu_args_t args = {
+            .rows_a = dpu_rows_a_actual,
+            .cols_a = cols_a,
+            .cols_b = cols_b
+        };
+
+        DPU_ASSERT(dpu_copy_to(dpu, "DPU_INPUT_ARGS", 0, &args, sizeof(args)));
+
+        if(dpu_rows_a_actual) {
+            uint32_t elems_a = dpu_rows_a_actual * cols_a;
+            uint32_t bytes_a = elems_a * sizeof(float);
+
+            float *A_chunk = (float*)malloc(bytes_a);
+
+            // pack this DPU's share of A rows into a contiguous buffer
+            for(int r=0; r<dpu_rows_a_actual; r++)
+                memcpy(A_chunk + r*cols_a, A + (row_start + r)*cols_a, cols_a * sizeof(float));
+
+            DPU_ASSERT(dpu_copy_to(dpu, "A_chunk", 0, A_chunk, bytes_a));
+            DPU_ASSERT(dpu_copy_to(dpu, "B_whole", 0, B, cols_a * cols_b * sizeof(float)));
+
+            free(A_chunk);
+        }
+    }
+
+    DPU_ASSERT(dpu_launch(dpus, DPU_SYNCHRONOUS));
+
+    DPU_FOREACH(dpus, dpu, each_dpu) {
+        uint32_t row_start = each_dpu * dpu_rows_a_max;
+        uint32_t dpu_rows_a_actual = (row_start >= rows_a) ? 0
+                                   : (dpu_rows_a_max > rows_a - row_start) ? (rows_a - row_start)
+                                   : dpu_rows_a_max;
+
+        if(dpu_rows_a_actual) {
+            uint32_t elems_c = dpu_rows_a_actual * cols_b;
+            uint32_t bytes_c = elems_c * sizeof(float);
+
+            float *C_chunk = (float*)malloc(bytes_c);
+
+            DPU_ASSERT(dpu_copy_from(dpu, "C_chunk", 0, C_chunk, bytes_c));
+
+            // scatter the result rows back into C
+            for(int r=0; r<dpu_rows_a_actual; r++)
+                memcpy(C + (row_start + r)*cols_b, C_chunk + r*cols_b, cols_b * sizeof(float));
+
+            free(C_chunk);
+        }
+    }
+}
diff --git a/src/get_delta.c b/src/host/get_delta.c
rename from src/get_delta.c
rename to src/host/get_delta.c
--- a/src/get_delta.c
+++ b/src/host/get_delta.c
@@ ... @@
-double *get_delta(NETWORK *n, double* sample, double* ideal, int layer_index)
+float *get_delta(NETWORK *n, float* sample, float* ideal, int layer_index)
 {
     int layer_size = (n->l+layer_index)->num_neurons;
 
-    double *d = (double*) malloc (sizeof(double) * layer_size);
+    float *d = (float*) malloc (sizeof(float) * layer_size);
     if(!d) {
         fprintf(stderr, "Error 10010\n");
         return NULL;
     }
 
-    double *z = get_z(n, layer_index, sample);
+    float *z = get_z(n, layer_index, sample);
     if(!z) {
         fprintf(stderr, "Error 10011\n");
         free(d);
@@ -21,7 +21,7 @@
 
     if(is_current_layer_last_layer)
     {
-        double *y = get_y(n, layer_index, sample);
+        float *y = get_y(n, layer_index, sample);
         if(!y) {
             fprintf(stderr, "Error 10012\n");
             free(d);
@@ -36,7 +36,7 @@
     }
     else
     {
-        double *next_d = get_delta(n, sample, ideal, layer_index+1);
+        float *next_d = get_delta(n, sample, ideal, layer_index+1);
         if(!next_d) {
             fprintf(stderr, "Error 10013\n");
             free(d);
diff --git a/src/get_total_loss.c b/src/host/get_total_loss.c
similarity index 67%
rename from src/get_total_loss.c
rename to src/host/get_total_loss.c
index 8bf7f2c..c386536 100644
--- a/src/get_total_loss.c
+++ b/src/host/get_total_loss.c
@@ -1,8 +1,8 @@
 #include "mlp.h"
 
-double *get_total_loss(NETWORK *n, double **samples, double **ideal, int nsamples)
+float *get_total_loss(NETWORK *n, float **samples, float **ideal, int nsamples)
 {
-    double *total_loss = (double*) malloc (sizeof(double));
+    float *total_loss = (float*) malloc (sizeof(float));
     if(!total_loss) {
         fprintf(stderr, "Error 10007\n");
         return NULL;
@@ -13,13 +13,13 @@
 
     LAYER *last_layer = n->l+(n->num_layers-1);
 
     for(int i=0; i<nsamples; i++)
     {
-        double *y = get_y(n, n->num_layers-1, samples[i]);
+        float *y = get_y(n, n->num_layers-1, samples[i]);
         if(!y) {
             fprintf(stderr, "Error 10008\n");
             free(total_loss);
             return NULL;
         }
 
-        *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (double)nsamples;
+        *total_loss += sse(y, ideal[i], last_layer->num_neurons) / (float)nsamples;
 
         free(y);
     }
diff --git a/src/get_y.c b/src/host/get_y.c
similarity index 78%
rename from src/get_y.c
rename to src/host/get_y.c
index 3e5b70e..5931c62 100644
--- a/src/get_y.c
+++ b/src/host/get_y.c
@@ -2,7 +2,7 @@
 
 // preactivation -> get_y -> activation
 
-double *get_y(NETWORK *n, int layer_index, double *sample)
+float *get_y(NETWORK *n, int layer_index, float *sample)
 {
     LAYER *current_layer = n->l+layer_index;
     int is_current_layer_last_layer = (n->num_layers == layer_index + 1);
@@ -11,9 +11,9 @@
     if(!is_current_layer_last_layer)    // add bias node
         y_size++;
 
-    double *z = get_z(n, layer_index, sample);
+    float *z = get_z(n, layer_index, sample);
 
-    double *y = (double *) malloc (sizeof(double)*y_size);
+    float *y = (float *) malloc (sizeof(float)*y_size);
     if(!y) {
         fprintf(stderr, "Error 10006\n");
         return NULL;
diff --git a/src/get_z.c b/src/host/get_z.c
similarity index 75%
rename from src/get_z.c
rename to src/host/get_z.c
index ad7a08d..466ee1a 100644
--- a/src/get_z.c
+++ b/src/host/get_z.c
@@ -2,20 +2,20 @@
 
 // samples -> get_z -> preactivation
 
-double *get_z(NETWORK *n, int layer_index, double *sample)
+float *get_z(NETWORK *n, int layer_index, float *sample)
 {
     LAYER *current_layer = n->l+layer_index;
     int z_neuroncount = current_layer->num_neurons;
     int z_weightcount = current_layer->n->num_weights;
     int is_first_layer = layer_index == 0;
 
-    double *z = (double *) malloc (sizeof(double)* z_neuroncount);
+    float *z = (float *) malloc (sizeof(float)* z_neuroncount);
     if(!z) {
         fprintf(stderr, "Error 10005\n");
         return NULL;
     }
 
-    double *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
+    float *z_prev = is_first_layer ? sample : get_y(n, layer_index-1, sample);
 
     for(size_t i=0; i<z_neuroncount; i++)
     {
         z[i] = 0;
diff --git a/src/init_layer.c b/src/host/init_layer.c
rename from src/init_layer.c
rename to src/host/init_layer.c
--- a/src/init_layer.c
+++ b/src/host/init_layer.c
@@ ... @@
     l->num_neurons = num_neurons;
 
-    l->inputs = (double*) malloc (batch_size * num_weights_per_neuron * sizeof(double));
+    l->inputs = (float*) malloc (batch_size * num_weights_per_neuron * sizeof(float));
     if(!l->inputs) {
         free(l);
         return NULL;
     }
 
-    l->deltas = (double*) malloc (batch_size * num_neurons * sizeof(double));
+    l->deltas = (float*) malloc (batch_size * num_neurons * sizeof(float));
     if(!l->deltas) {
         free(l->inputs);
         free(l);
diff --git a/src/init_network.c b/src/host/init_network.c
similarity index 100%
rename from src/init_network.c
rename to src/host/init_network.c
diff --git a/src/init_neuron.c b/src/host/init_neuron.c
similarity index 65%
rename from src/init_neuron.c
rename to src/host/init_neuron.c
index b5506f3..450677e 100644
--- a/src/init_neuron.c
+++ b/src/host/init_neuron.c
@@ -9,20 +9,20 @@
 
     n->num_weights = num_weights;
 
-    n->w = (double *) malloc (sizeof(double) * n->num_weights);
+    n->w = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->w) {
         free(n);
         return NULL;
     }
 
-    n->lw = (double *) malloc (sizeof(double) * n->num_weights);
+    n->lw = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->lw) {
         free(n->w);
         free(n);
         return NULL;
     }
 
-    n->batch_dw = (double *) malloc (sizeof(double) * n->num_weights);
+    n->batch_dw = (float *) malloc (sizeof(float) * n->num_weights);
     if(!n->batch_dw) {
         free(n->lw);
         free(n->w);
@@ -30,11 +30,11 @@
         return NULL;
     }
 
-    double limit = 1.0/sqrt((double) num_weights);
+    float limit = 1.0f/sqrtf((float) num_weights);
 
     for(int i=0; i<num_weights; i++)
     {
-        double rand_unit = drand();
+        float rand_unit = drand();
         n->w[i] = (rand_unit * 2.0 - 1.0) * limit;
         n->lw[i] = n->w[i];
         n->batch_dw[i] = 0;
diff --git a/src/host/matrix.c b/src/host/matrix.c
new file mode 100644
index 0000000..967ba10
--- /dev/null
+++ b/src/host/matrix.c
@@ -0,0 +1,34 @@
+#include "mlp.h"
+#include "upmem.h"
+
+void multiply_matrix(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+#ifdef UPMEM
+    init_dpus();
+    multiply_matrix_upmem(A, B, C, rows_a, cols_a, cols_b);
+#else
+    multiply_matrix_naive(A, B, C, rows_a, cols_a, cols_b);
+#endif
+}
+
+void multiply_matrix_naive(const float *A, const float *B, float *C, int rows_a, int cols_a, int cols_b)
+{
+    for(int i=0; i<rows_a; i++) {
+        for(int j=0; j<cols_b; j++) {
+            C[i*cols_b + j] = 0;
+            for(int k=0; k<cols_a; k++) {
+                C[i*cols_b + j] += A[i*cols_a + k] * B[k*cols_b + j];
+            }
+        }
+    }
+}
+
+void transpose_matrix(const float *A, float *C, int rows, int cols)
+{
+    for(int i=0; i<rows; i++) {
+        for(int j=0; j<cols; j++) {
+            C[j*rows + i] = A[i*cols + j];
+        }
+    }
+}
diff --git a/src/mlp.c b/src/host/main.c
similarity index 58%
rename from src/mlp.c
rename to src/host/main.c
index fcbe0d3..b55b91a 100644
--- a/src/mlp.c
+++ b/src/host/main.c
@@ -1,6 +1,8 @@
 #include "mlp.h"
+#include <string.h>
+#include <x86intrin.h>
 
 unsigned int rseed = 42;
@@ -11,8 +13,8 @@
     int epoch = 0;
 
     int num_inputs = NUM_FEATURES;
-    int num_layers = 5;
-    int num_neurons_per_layer[] = {NUM_FEATURES, 1000, 1000, 100, NUM_LABELS};
+    int num_layers = 3;
+    int num_neurons_per_layer[] = {NUM_FEATURES, 10, NUM_LABELS};
 
     NETWORK *n = init_network(num_inputs, num_layers, num_neurons_per_layer, BATCH_SIZE);
     if(!n) {
@@ -20,16 +22,16 @@
         return 1;
     }
 
-    double **samples = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
-    double **labels = (double **) malloc (sizeof(double*)*NUM_TRAIN_SAMPLES);
+    float **samples = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
+    float **labels = (float **) malloc (sizeof(float*)*NUM_TRAIN_SAMPLES);
 
     uint8_t **sample_data = read_image_data(TRAINING_SAMPLES_FILE, &sample_rows, NUM_FEATURES);
     uint8_t **label_data = read_image_data(TRAINING_LABELS_FILE, &label_rows, 1);
 
     // save data into `samples` and `labels`
     for(size_t i=0; i<NUM_TRAIN_SAMPLES; ++i)
     {
-        samples[i] = (double*) malloc (sizeof(double)*NUM_FEATURES);
-        labels[i] = (double*) malloc (sizeof(double)*NUM_LABELS);
+        samples[i] = (float*) malloc (sizeof(float)*NUM_FEATURES);
+        labels[i] = (float*) malloc (sizeof(float)*NUM_LABELS);
@@ ... @@
-    double *loss_prev = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
+    float *loss_prev = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
 
+#ifdef EVAL
+    unsigned long long cc_start = __rdtsc();
+#endif
+
     while(1)
     {
         for(int batch_start=0; batch_start<NUM_TRAIN_SAMPLES; batch_start+=BATCH_SIZE)
         {
             int actual_batch_size = (NUM_TRAIN_SAMPLES - batch_start < BATCH_SIZE) ? (NUM_TRAIN_SAMPLES - batch_start) : BATCH_SIZE;
 
             for(int batch_ctr=0; batch_ctr<actual_batch_size; ++batch_ctr)
             {
                 int i = batch_start + batch_ctr;
 
                 for(int j=n->num_layers-1; j>=0; --j)
                 {
                     LAYER *lp = n->l+j;    // ptr to layer j of network n
 
-                    double *d = get_delta(n, samples[i], labels[i], j);
+                    float *d = get_delta(n, samples[i], labels[i], j);
 
-                    memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(double));
+                    memcpy(lp->deltas+batch_ctr*lp->num_neurons, d, lp->num_neurons * sizeof(float));
 
-                    double *py = j ? get_y(n, j-1, samples[i]) : NULL;
+                    float *py = j ? get_y(n, j-1, samples[i]) : NULL;
                     if(j && !py) {
                         fprintf(stderr, "Error 10009\n");
                         return 1;
                     }
 
-                    memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(double));
+                    memcpy(lp->inputs+batch_ctr*lp->n->num_weights, (j ? py : samples[i]), lp->n->num_weights * sizeof(float));
 
                     free(d);
                     if(j) free(py);
@@ -105,18 +117,18 @@
             apply_gradients(n, actual_batch_size);
         }
 
-        double *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
+        float *loss_new = get_total_loss(n, samples, labels, NUM_TRAIN_SAMPLES);
         if(!loss_new) {
             fprintf(stderr, "Error 10015\n");
             return 1;
         }
 
-        double loss_delta = fabs(*loss_new - *loss_prev);
+        float loss_delta = fabsf(*loss_new - *loss_prev);
 
         epoch++;
 
 #ifdef VERBOSE
-        printf("Epoch %-3d --- Lost Delta = %.9lf --- Final Loss = %.6lf\n", epoch, loss_delta, *loss_new);
+        printf("Epoch %-3d --- Loss Delta = %.9f --- Final Loss = %.6f\n", epoch, loss_delta, *loss_new);
 #endif
 
         free(loss_prev);
@@ -126,7 +138,10 @@
             break;
     }
 
-    printf("Training complete in %d epochs\n", epoch);
+#ifdef EVAL
+    unsigned long long cc_end = __rdtsc();
+    printf("Training complete | %llu cycles | %d epochs\n", cc_end-cc_start, epoch);
+#endif
 
 #ifdef DEBUG
     printf("\n===== Weights =====\n\n");
     for(int i=0; i<n->num_layers; i++) {
         LAYER *lp = n->l+i;    // ptr to i-th layer of the network n
         for(int j=0; j<lp->num_neurons; j++) {
             NEURON *np = lp->n+j;    // ptr to j-th neuron of the i-th layer of network n
-            print_double_vector(np->w, np->num_weights);
+            print_float_vector(np->w, np->num_weights);
             printf("\n");
         }
         printf("\n\n");
     }
 #endif
 
+    // free DPUs if UPMEM was deployed
+    if(upmem_initialized) {
+        free_dpus();
+    }
+
     // memory cleanup before termination
-    free_double_matrix(samples, NUM_TRAIN_SAMPLES);
-    free_double_matrix(labels, NUM_TRAIN_SAMPLES);
+    free_float_matrix(samples, NUM_TRAIN_SAMPLES);
+    free_float_matrix(labels, NUM_TRAIN_SAMPLES);
     free_network(n);
 
     return 0;
diff --git a/src/read_image_data.c b/src/host/read_image_data.c
similarity index 100%
rename from src/read_image_data.c
rename to src/host/read_image_data.c
diff --git a/src/host/sse.c b/src/host/sse.c
new file mode 100644
index 0000000..cf58db9
--- /dev/null
+++ b/src/host/sse.c
@@ -0,0 +1,13 @@
+#include "mlp.h"
+
+float sse(float *real, float *ideal, int length)
+{
+    float sse = 0.0;    // Sum of squared errors
+
+    for(size_t i=0; i<length; i++)
+    {
+        sse += powf(real[i] - ideal[i], 2);
+    }
+
+    return sse;
+}
diff --git a/src/utility.c b/src/host/utility.c
rename from src/utility.c
rename to src/host/utility.c
--- a/src/utility.c
+++ b/src/host/utility.c
@@ ... @@
     n->num_weights = 0;
 }
 
-void free_double_matrix(double **addr, int nrows)
+void free_float_matrix(float **addr, int nrows)
 {
     if(!addr)
         return;
@@ -68,19 +68,19 @@
     free(addr);
 }
 
-void print_double_matrix(double **addr, int nrows, int ncols)
+void print_float_matrix(float **addr, int nrows, int ncols)
 {
     for(size_t i=0; i<nrows; i++)
     {
         for(size_t j=0; j<ncols; j++)
-            printf("%lf ", addr[i][j]);
+            printf("%f ", addr[i][j]);
         printf("\n");
     }
 }
 
-void print_double_vector(double *addr, int nrows)
+void print_float_vector(float *addr, int nrows)
 {
     for(size_t i=0; i<nrows; i++)
-        printf("%lf ", addr[i]);
+        printf("%f ", addr[i]);
     printf("\n");
 }
diff --git a/tests/test_accumulate_layer_gradients.c b/tests/test_accumulate_layer_gradients.c
--- a/tests/test_accumulate_layer_gradients.c
+++ b/tests/test_accumulate_layer_gradients.c
@@ ... @@
-        first_layer->inputs[i] = ((double) rand() / (double) RAND_MAX) * 20;
+        first_layer->inputs[i] = ((float) rand() / (float) RAND_MAX) * 20;
 
     // deltas is a 1x4 identity matrix
     for(int i=0; i<1*4; i++)
         first_layer->deltas[i] = 1.0;
 
-    double batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    float batch_dw_ideal[4][5] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
     for(int i=0; i<4; i++)
         for(int j=0; j<5; j++)
@@ -29,7 +29,7 @@
 
     for(int i=0; i<4; i++)
         for(int j=0; j<5; j++)
-            test_pass_fail &= batch_dw_ideal[i][j] == first_layer->n[i].batch_dw[j];
+            test_pass_fail &= TEST_FLOAT_EQ(batch_dw_ideal[i][j], first_layer->n[i].batch_dw[j], EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_activation.c b/tests/test_activation.c
index 8998218..2d120bb 100644
--- a/tests/test_activation.c
+++ b/tests/test_activation.c
@@ -1,12 +1,12 @@
 #include "mlp.h"
 #include "test.h"
 
-int test_activation(double x)
+int test_activation(float x)
 {
-    double activation_result = get_activation(x);
-    double activation_derivative_result = get_activation_derivative(x);
+    float activation_result = get_activation(x);
+    float activation_derivative_result = get_activation_derivative(x);
 
-    double expected_activation_derivative = 1 - pow(activation_result, 2);
+    float expected_activation_derivative = 1 - powf(activation_result, 2);
 
     if(abs(activation_derivative_result - expected_activation_derivative) < 1e-5)
         return 1;
diff --git a/tests/test_drand.c b/tests/test_drand.c
index 1411e08..2771bb1 100644
--- a/tests/test_drand.c
+++ b/tests/test_drand.c
@@ -7,7 +7,7 @@
 
     for(int i=0; i<10; i++)
     {
-        double test_value = drand();
+        float test_value = drand();
         test_pass_fail &= (test_value >= 0.0) && (test_value <= 1.0);
     }
 
diff --git a/tests/test_get_delta.c b/tests/test_get_delta.c
index b8a06c6..97e63c1 100644
--- a/tests/test_get_delta.c
+++ b/tests/test_get_delta.c
@@ -6,8 +6,8 @@
     int test_pass_fail = 1;
 
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
-    double ideals[] = {3, 3, 3, 3};
+    float samples[] = {1, 1, 1, 1};
+    float ideals[] = {3, 3, 3, 3};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -28,20 +28,24 @@
 
     // test last layer delta
 
-    double *d_last_layer = get_delta(n, samples, ideals, 1);
+    float *d_last_layer = get_delta(n, samples, ideals, 1);
 
     for(int i=0; i<3; i++)
     {
-        test_pass_fail &= (d_last_layer[i] == (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]));
+        test_pass_fail &= TEST_FLOAT_EQ(d_last_layer[i],
+                                        (ideals[i] - get_y(n, 1, samples)[i]) * get_activation_derivative(get_z(n, 1, samples)[i]),
+                                        EPS_TEST);
     }
 
     // test before-last layer delta
 
-    double *d_first_layer = get_delta(n, samples, ideals, 0);
+    float *d_first_layer = get_delta(n, samples, ideals, 0);
 
     for(int i=0; i<3; i++)
    {
-        test_pass_fail &= (d_first_layer[i] == (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]));
+        test_pass_fail &= TEST_FLOAT_EQ(d_first_layer[i],
+                                        (d_last_layer[0] + d_last_layer[1] + d_last_layer[2]) * get_activation_derivative(get_z(n, 0, samples)[i]),
+                                        EPS_TEST);
     }
 
     return test_pass_fail;
diff --git a/tests/test_get_y.c b/tests/test_get_y.c
index 4274682..30206bd 100644
--- a/tests/test_get_y.c
+++ b/tests/test_get_y.c
@@ -4,7 +4,7 @@
 {
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
+    float samples[] = {1, 1, 1, 1};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -23,24 +23,24 @@
     n->l[0].n[2].w[2] = 0.0;
     n->l[0].n[2].w[3] = 0.0;
 
-    double *y = get_y(n, 0, samples);
-    double *z = get_z(n, 0, samples);
+    float *y = get_y(n, 0, samples);
+    float *z = get_z(n, 0, samples);
 
-    // printf("y[0] == %.2lf\n", y[0]);
-    // printf("y[1] == %.2lf\n", y[1]);
-    // printf("y[2] == %.2lf\n", y[2]);
+    // printf("y[0] == %.2f\n", y[0]);
+    // printf("y[1] == %.2f\n", y[1]);
+    // printf("y[2] == %.2f\n", y[2]);
 
-    int test_pass_fail = (y[0] == 1)
-                         && (y[1] == get_activation(z[0]))
-                         && (y[2] == get_activation(z[1]));
+    int test_pass_fail = TEST_FLOAT_EQ(y[0], 1, EPS_TEST)
+                         && TEST_FLOAT_EQ(y[1], get_activation(z[0]), EPS_TEST)
+                         && TEST_FLOAT_EQ(y[2], get_activation(z[1]), EPS_TEST);
 
     y = get_y(n, 1, samples);
     z = get_z(n, 1, samples);
 
     test_pass_fail = test_pass_fail
-                     && (y[0] == get_activation(z[0]))
-                     && (y[1] == get_activation(z[1]))
-                     && (y[2] == get_activation(z[2]));
+                     && TEST_FLOAT_EQ(y[0], get_activation(z[0]), EPS_TEST)
+                     && TEST_FLOAT_EQ(y[1], get_activation(z[1]), EPS_TEST)
+                     && TEST_FLOAT_EQ(y[2], get_activation(z[2]), EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_get_z.c b/tests/test_get_z.c
index be921c6..4367604 100644
--- a/tests/test_get_z.c
+++ b/tests/test_get_z.c
@@ -4,7 +4,7 @@
 {
     int num_neurons_per_layers[] = {3, 3};
-    double samples[] = {1, 1, 1, 1};
+    float samples[] = {1, 1, 1, 1};
 
     NETWORK *n = init_network(3, 2, num_neurons_per_layers, BATCH_SIZE);
 
@@ -23,13 +23,13 @@
     n->l[0].n[2].w[2] = 0.0;
     n->l[0].n[2].w[3] = 0.0;
 
-    double *z = get_z(n, 0, samples);
+    float *z = get_z(n, 0, samples);
 
-    // printf("z[0] == %.2lf\n", z[0]);
-    // printf("z[1] == %.2lf\n", z[1]);
-    // printf("z[2] == %.2lf\n", z[2]);
+    // printf("z[0] == %.2f\n", z[0]);
+    // printf("z[1] == %.2f\n", z[1]);
+    // printf("z[2] == %.2f\n", z[2]);
 
-    int test_pass_fail = (z[0] == 2) && (z[1] == 6) && (z[2] == -1);
+    int test_pass_fail = TEST_FLOAT_EQ(z[0], 2, EPS_TEST) && TEST_FLOAT_EQ(z[1], 6, EPS_TEST) && TEST_FLOAT_EQ(z[2], -1, EPS_TEST);
 
     return test_pass_fail;
 }
diff --git a/tests/test_init_layer.c b/tests/test_init_layer.c
index 13b7ae6..761f331 100644
--- a/tests/test_init_layer.c
+++ b/tests/test_init_layer.c
@@ -6,10 +6,10 @@
     LAYER *l = init_layer(3, 4, BATCH_SIZE);
 
     // printf("%d\n", l->num_neurons);
-    // printf("%lf\n", l->n[0].w[0]);
-    // printf("%lf\n", l->n[1].w[0]);
-    // printf("%lf\n", l->n[2].w[0]);
-    // printf("%lf\n", l->n[0].lw[0]);
+    // printf("%f\n", l->n[0].w[0]);
+    // printf("%f\n", l->n[1].w[0]);
+    // printf("%f\n", l->n[2].w[0]);
+    // printf("%f\n", l->n[0].lw[0]);
     // printf("%d\n", l->n[0].num_weights);
     // printf("%d\n", l->n[1].num_weights);
     // printf("%d\n", l->n[2].num_weights);
diff --git a/tests/test_init_network.c b/tests/test_init_network.c
index 375565f..2e5603b 100644
--- a/tests/test_init_network.c
+++ b/tests/test_init_network.c
@@ -12,7 +12,7 @@
     // printf("%d\n", n->l[1].num_neurons);
     // printf("%d\n", n->l[2].num_neurons);
     // printf("%d\n", n->l[0].n[0].num_weights);
-    // printf("%lf\n", n->l[0].n[0].lw[0]);
+    // printf("%f\n", n->l[0].n[0].lw[0]);
     // printf("%d\n", n->l[1].n[0].num_weights);
     // printf("%d\n", n->l[2].n[0].num_weights);
 
diff --git a/tests/test_init_neuron.c b/tests/test_init_neuron.c
index 61d0232..486548c 100644
--- a/tests/test_init_neuron.c
+++ b/tests/test_init_neuron.c
@@ -6,8 +6,8 @@
     NEURON *n = init_neuron(2);
 
     // printf("%d\n", n->num_weights);
-    // printf("%lf\n", n->w[0]);
-    // printf("%lf\n", n->lw[0]);
+    // printf("%f\n", n->w[0]);
+    // printf("%f\n", n->lw[0]);
 
     return (n->num_weights == 2) && (n->w[0] <= 1) && (n->w[0] >= -1) && (n->lw[0] == n->w[0]);
 }
diff --git a/tests/test_matrix.c b/tests/test_matrix.c
index 25323e9..37e9308 100644
--- a/tests/test_matrix.c
+++ b/tests/test_matrix.c
@@ -1,31 +1,43 @@
 #include "mlp.h"
 #include "test.h"
+#include "upmem.h"
 
 int test_multiply_matrix()
 {
     int test_result_pass_fail = 1;
 
-    double matrixA[2*3] = {1.0, 2.0, 3.0,
-                           0.0, 5.0, 6.0};
+    float matrixA[2*3] = {1.0, 2.0, 3.0,
+                          0.0, 5.0, 6.0};
 
-    double matrixB[3*2] = {2.0, 6.0,
-                           3.0, 3.0,
-                           4.0, 0.0};
+    float matrixB[3*2] = {2.0, 6.0,
+                          3.0, 3.0,
+                          4.0, 0.0};
 
-    // result matrix (initialized with random double values [0.0, 20.0])
-    double matrixC[2*2];
+    // result matrices (initialized with random float values [0.0, 20.0])
+    float matrixC[2*2];
+    float matrixD[2*2];
     for(int i=0; i<2*2; i++) {
-        matrixC[i] = ((double)rand() / (double)RAND_MAX) * 20;
+        matrixC[i] = ((float)rand() / (float)RAND_MAX) * 20;
+        matrixD[i] = ((float)rand() / (float)RAND_MAX) * 20;
     }
 
     // ideal result
-    double matrixR[2*2] = {20.0, 12.0,
-                           39.0, 15.0};
+    float matrixR[2*2] = {20.0, 12.0,
+                          39.0, 15.0};
 
-    multiply_matrix(matrixA, matrixB, matrixC, 2, 3, 2);
+    multiply_matrix_naive(matrixA, matrixB, matrixC, 2, 3, 2);
+
+    init_dpus();
+    multiply_matrix_upmem(matrixA, matrixB, matrixD, 2, 3, 2);
+    free_dpus();
 
     for(int i=0; i<2*2; i++) {
-        test_result_pass_fail |= matrixC[i] == matrixR[i];
+        test_result_pass_fail &= TEST_FLOAT_EQ(matrixC[i], matrixR[i], EPS_TEST);
+        test_result_pass_fail &= TEST_FLOAT_EQ(matrixC[i], matrixD[i], EPS_TEST);
     }
 
     return test_result_pass_fail;
@@ -35,17 +47,17 @@
 {
     int test_result_pass_fail = 1;
 
-    double matrixA[2*3] = {1.0, 2.0, 3.0,
-                           0.0, 5.0, 6.0};
+    float matrixA[2*3] = {1.0, 2.0, 3.0,
+                          0.0, 5.0, 6.0};
 
-    // result matrix (initialized with random double values [0.0, 20.0])
-    double matrixT[3*2];
+    // result matrix (initialized with random float values [0.0, 20.0])
+    float matrixT[3*2];
     for(int i=0; i<3*2; i++) {
-        matrixT[i] = ((double)rand() / (double)RAND_MAX) * 20;
+        matrixT[i] = ((float)rand() / (float)RAND_MAX) * 20;
     }
 
     // ideal result
-    double matrixR[3*2] = {1.0, 0.0,
-                           2.0, 5.0,
-                           3.0, 6.0};
+    float matrixR[3*2] = {1.0, 0.0,
+                          2.0, 5.0,
+                          3.0, 6.0};
 
diff --git a/tests/test_sse.c b/tests/test_sse.c
index 8f660c0..732258a 100644
--- a/tests/test_sse.c
+++ b/tests/test_sse.c
@@ -3,26 +3,26 @@
 
 int test_sse()
 {
-    double real[] = {3, 4, 4, 4};
-    double ideal[] = {4, 4, 4, 4};
+    float real[] = {3, 4, 4, 4};
+    float ideal[] = {4, 4, 4, 4};
 
     int test_pass_fail = 1;
 
-    double sse_result = sse(real, ideal, 4);
+    float sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 1);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 1, EPS_TEST);
 
     real[0] = 4;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 0);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 0, EPS_TEST);
 
     real[0] = 6;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 4);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 4, EPS_TEST);
 
     real[0] = 6;
     real[1] = 2;
     sse_result = sse(real, ideal, 4);
 
-    test_pass_fail = test_pass_fail && (sse_result == 8);
+    test_pass_fail &= TEST_FLOAT_EQ(sse_result, 8, EPS_TEST);
 
     return test_pass_fail;
 }