Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
cmake_minimum_required(VERSION 3.5)
project(fdct)

add_compile_options(-Wall)

# AVX2
add_compile_options(-mavx2)

# OpenMP
find_package(OpenMP REQUIRED)
add_compile_options(${OpenMP_CXX_FLAGS})

# Find Python
find_package(PythonLibs REQUIRED)
include_directories(${PYTHON_INCLUDE_DIRS})

# Find Pybind11
find_package(pybind11 REQUIRED)

find_package(PkgConfig REQUIRED)
pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
include_directories(PkgConfig::FFTW)
link_libraries(PkgConfig::FFTW)
# find_package(PkgConfig REQUIRED)
# pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
# include_directories(PkgConfig::FFTW)
# link_libraries(PkgConfig::FFTW)

pybind11_add_module(fdct
src/fdct_module.cpp
Expand All @@ -21,3 +30,4 @@ pybind11_add_module(fdct

# Link the module with Python
target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
target_link_libraries(fdct PRIVATE OpenMP::OpenMP_CXX)
6 changes: 5 additions & 1 deletion README.org
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ modern multicore processors.
:PROPERTIES:
:CUSTOM_ID: prospective-users
:END:
FDCP will be valuable in fieds where DCT is used extensively:
FDCT will be valuable in fieds where DCT is used extensively:

- Image processing and compression
- Audio processing and compression
Expand Down Expand Up @@ -136,3 +136,7 @@ multithread=True, simd=True)
[[https://coehuman.uodiyala.edu.iq/uploads/Coehuman%20library%20pdf/%D9%83%D8%AA%D8%A8%20%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D9%8A%D8%A7%D8%AA%20Mathematics%20books/Wavelets/25%20(2).pdf]]
3. C. W. Kok, Fast algorithm for computing discrete cosine transform
[[https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=558495]]
4. Nayuki, Fast discrete cosine transform algorithms
[[https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms]]
5. Nayuki, Free small FFT in multiple languages
[[https://www.nayuki.io/page/free-small-fft-in-multiple-languages]]
4 changes: 3 additions & 1 deletion ref/README.org
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@

This reference implementation is provided by Nayuki. I greatly appreciate their contribution.

For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]
For more details, please visit Nayuki's website
- https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
91 changes: 58 additions & 33 deletions src/complex_array.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#ifndef COMPLEX_ARRAY_HPP
#define COMPLEX_ARRAY_HPP
#pragma once
#include <immintrin.h>

#include <algorithm>
#include <complex>
#include <stdexcept>
Expand Down Expand Up @@ -101,37 +102,65 @@ class ComplexArray {
if (array_size != other.array_size) {
throw std::invalid_argument("Array sizes do not match");
}

for (size_t i = 0; i < array_size; i++) {
double temp_real =
real[i] * other.real[i] - imag[i] * other.imag[i];
double temp_imag =
real[i] * other.imag[i] + imag[i] * other.real[i];
real[i] = temp_real;
imag[i] = temp_imag;
if (simd) {
for (size_t i = 0; i <= array_size - 4; i += 4) {
__m256d real_vec = _mm256_loadu_pd(real + i);
__m256d imag_vec = _mm256_loadu_pd(imag + i);
__m256d other_real_vec = _mm256_loadu_pd(other.real + i);
__m256d other_imag_vec = _mm256_loadu_pd(other.imag + i);

__m256d temp_real =
_mm256_sub_pd(_mm256_mul_pd(real_vec, other_real_vec),
_mm256_mul_pd(imag_vec, other_imag_vec));
__m256d temp_imag =
_mm256_add_pd(_mm256_mul_pd(real_vec, other_imag_vec),
_mm256_mul_pd(imag_vec, other_real_vec));

_mm256_storeu_pd(real + i, temp_real);
_mm256_storeu_pd(imag + i, temp_imag);
}
for (size_t i = array_size - array_size % 4; i < array_size; i++) {
double temp_real =
real[i] * other.real[i] - imag[i] * other.imag[i];
double temp_imag =
real[i] * other.imag[i] + imag[i] * other.real[i];
real[i] = temp_real;
imag[i] = temp_imag;
}
} else {
for (size_t i = 0; i < array_size; i++) {
double temp_real =
real[i] * other.real[i] - imag[i] * other.imag[i];
double temp_imag =
real[i] * other.imag[i] + imag[i] * other.real[i];
real[i] = temp_real;
imag[i] = temp_imag;
}
}
}

// void partial_multiply(const ComplexArray &other, size_t start, size_t length) {

// size_t end = start + length;
// if (end > array_size || end > other.array_size) {
// throw std::invalid_argument("Array sizes do not match");
// }
// for (size_t i = 0; i < length; i++) {
// double temp_real =
// real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
// double temp_imag =
// real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
// real[start + i] = temp_real;
// imag[start + i] = temp_imag;
// }
// }

void scale(double scalar, bool simd = false) {
for (size_t i = 0; i < array_size; i++) {
real[i] *= scalar;
imag[i] *= scalar;
if (simd) {
__m256d scalar_vec = _mm256_set1_pd(scalar);
for (size_t i = 0; i <= array_size - 4; i += 4) {
__m256d real_vec = _mm256_loadu_pd(real + i);
__m256d imag_vec = _mm256_loadu_pd(imag + i);

real_vec = _mm256_mul_pd(real_vec, scalar_vec);
imag_vec = _mm256_mul_pd(imag_vec, scalar_vec);

_mm256_storeu_pd(real + i, real_vec);
_mm256_storeu_pd(imag + i, imag_vec);
}
for (size_t i = array_size - array_size % 4; i < array_size; i++) {
real[i] *= scalar;
imag[i] *= scalar;
}
} else {
for (size_t i = 0; i < array_size; i++) {
real[i] *= scalar;
imag[i] *= scalar;
}
}
}

Expand Down Expand Up @@ -165,8 +194,6 @@ class ComplexArray {
return ComplexProxy(real + index, imag + index);
}



size_t size() const { return array_size; }

double *get_real_ptr() { return real; }
Expand All @@ -177,5 +204,3 @@ class ComplexArray {

const double *get_imag_ptr() const { return imag; }
};

#endif
18 changes: 10 additions & 8 deletions src/dct_naive.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace py = pybind11;
constexpr double SQRT2_1 = 0.7071067811865475;
constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;

static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) {
static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
double sum = 0.0f;
for (size_t j = 0; j < buf_size; j++) {
sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
Expand All @@ -28,42 +28,44 @@ static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
return sum;
}

void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
void dct_naive_inner(double *buf, size_t buf_size, double *out) {
#pragma omp parallel for
for (size_t i = 0; i < buf_size; i++) {
out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd);
out[i] = 2.0 * dct_naive_single(buf, buf_size, i);
}
}

void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
void idct_naive_inner(double *buf, size_t buf_size, double *out) {
#pragma omp parallel for
for (size_t i = 0; i < buf_size; i++) {
out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size);
}
}

py::array dct_naive(
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
py::buffer_info buf_info = in_arr.request();
double *ptr = static_cast<double *>(buf_info.ptr);
size_t buf_size = buf_info.size;

py::array_t<double> out_arr(buf_info.shape);

double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
dct_naive_inner(ptr, buf_size, out_buf, simd);
dct_naive_inner(ptr, buf_size, out_buf);

return out_arr;
}

py::array idct_naive(
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
py::buffer_info buf_info = in_arr.request();
double *ptr = static_cast<double *>(buf_info.ptr);
size_t buf_size = buf_info.size;

py::array_t<double> out_arr(buf_info.shape);

double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
idct_naive_inner(ptr, buf_size, out_buf, simd);
idct_naive_inner(ptr, buf_size, out_buf);

return out_arr;
}
Expand Down
56 changes: 8 additions & 48 deletions src/fdct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace py = pybind11;

// Wrapper function to accept and return NumPy arrays
py::array fdct(
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
// Request a buffer from the input array
py::buffer_info buf = in_arr.request();
// Check if the input array is 1-dimensional or 2-dimensional
Expand All @@ -38,57 +38,16 @@ py::array fdct(
static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
// Perform FCT
double *result = new double[N];
fct_inner(input_data, result, simd);

py::array_t<double, py::array::c_style> result(N);
double *result_ptr = static_cast<double *>(result.request().ptr);
fct_inner(input_data, result_ptr);
// Convert result back to NumPy array
return py::array_t<double>(N, result);
}

std::vector<double> idct_inner(const std::vector<double> &c) {
size_t N = c.size();
std::vector<double> x(N);
// auto u = reorderOutput(c);
ComplexArray u(N);
// Allocate FFTW input/output arrays
fftw_complex *fft_in =
(fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
fftw_complex *fft_out =
(fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);

// Initialize FFTW plan
fftw_plan plan =
fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);

// Fill FFTW input array
for (size_t n = 0; n < N; ++n) {
fft_in[n][0] = c[n]; // Real part
fft_in[n][1] = 0.0; // Imaginary part
}

// Execute IFFT
fftw_execute(plan);

// Compute the IDCT using the IFFT output
for (size_t k = 0; k < N; ++k) {
double real = fft_out[k][0];
double imag = fft_out[k][1];
x[k] = (real * cos(M_PI * k / (2.0 * N)) +
imag * sin(M_PI * k / (2.0 * N))) /
N;
}

// Free FFTW resources
fftw_destroy_plan(plan);
fftw_free(fft_in);
fftw_free(fft_out);

return x;
return result;
}

// Wrapper function to accept and return NumPy arrays
py::array ifdct(
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
// Request a buffer from the input array
py::buffer_info buf = in_arr.request();
if (buf.ndim != 1) {
Expand All @@ -105,7 +64,8 @@ py::array ifdct(

// Perform IDCT
double *result = new double[N];
ifdct_inner(input_data, result, simd);
ifdct_inner(input_data, result);
// std::fill(result, result + N, 0.0);
// Convert result back to NumPy array
return py::array_t<double>(N, result);
}
6 changes: 4 additions & 2 deletions src/fdct_inner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

// Function to compute the Fast Cosine Transform


ComplexArray reorderInput(const ComplexArray &x) {
size_t N = x.size();
ComplexArray u(N);
Expand Down Expand Up @@ -74,7 +75,7 @@ void idctPreprocess(ComplexArray &x) {
}


void fct_inner(const ComplexArray &x, double *result, bool simd) {
void fct_inner(const ComplexArray &x, double *result) {
size_t N = x.size();
auto fft_in = reorderInput(x);

Expand All @@ -84,14 +85,15 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) {
// Compute the DCT using the FFT output
double *x_real = fft_in.get_real_ptr();
double *x_imag = fft_in.get_imag_ptr();
#pragma omp parallel for
for (size_t k = 0; k < N; ++k) {
result[k] =
(x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) *
2;
}
}

void ifdct_inner(const ComplexArray &c, double *result, bool simd) {
void ifdct_inner(const ComplexArray &c, double *result) {
size_t N = c.size();
ComplexArray fft_in = c;
idctPreprocess(fft_in);
Expand Down
4 changes: 2 additions & 2 deletions src/fdct_inner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@

ComplexArray reorderInput(const ComplexArray &x);

void fct_inner(const ComplexArray &x, double *result, bool simd);
void fct_inner(const ComplexArray &x, double *result);

void ifdct_inner(const ComplexArray &c, double *result, bool simd);
void ifdct_inner(const ComplexArray &c, double *result);
11 changes: 11 additions & 0 deletions src/fdct_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,25 @@
#include <pybind11/stl.h>
#include "fdct.hpp"
#include "dct_naive.hpp"
#include "complex_array.hpp"

namespace py = pybind11;
#define _USE_MATH_DEFINES
#include <cmath>

void _set_num_thread(size_t n) {
fft::set_num_threads(n);
}

void _set_simd(bool s) {
fft::set_simd(s);
}

PYBIND11_MODULE(fdct, m) {
m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation.");
m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation.");
m.def("fdct", &fdct, "Compute fast discrete cosine transformation.");
m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation.");
m.def("set_num_threads", &_set_num_thread, "Set the number of threads to use.");
m.def("set_simd", _set_simd, "Set whether to use SIMD instructions.");
}
Loading