cfmc30 · cfmc30 · Dec 15, 2024 · Dec 9, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,17 +1,26 @@
 cmake_minimum_required(VERSION 3.5)
 project(fdct)
 
+add_compile_options(-Wall)
+
+# AVX2
+add_compile_options(-mavx2)
+
+# OpenMP
+find_package(OpenMP REQUIRED)
+add_compile_options(${OpenMP_CXX_FLAGS})
+
 # Find Python
 find_package(PythonLibs REQUIRED)
 include_directories(${PYTHON_INCLUDE_DIRS})
 
 # Find Pybind11
 find_package(pybind11 REQUIRED)
 
-find_package(PkgConfig REQUIRED)
-pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
-include_directories(PkgConfig::FFTW)
-link_libraries(PkgConfig::FFTW)
+# find_package(PkgConfig REQUIRED)
+# pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
+# include_directories(PkgConfig::FFTW)
+# link_libraries(PkgConfig::FFTW)
 
 pybind11_add_module(fdct 
     src/fdct_module.cpp
@@ -21,3 +30,4 @@ pybind11_add_module(fdct
 
 # Link the module with Python
 target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
+target_link_libraries(fdct PRIVATE OpenMP::OpenMP_CXX)
diff --git a/README.org b/README.org
@@ -43,7 +43,7 @@ modern multicore processors.
 :PROPERTIES:
 :CUSTOM_ID: prospective-users
 :END:
-FDCP will be valuable in fieds where DCT is used extensively:
+FDCT will be valuable in fieds where DCT is used extensively:
 
 - Image processing and compression
 - Audio processing and compression
@@ -136,3 +136,7 @@ multithread=True, simd=True)
    [[https://coehuman.uodiyala.edu.iq/uploads/Coehuman%20library%20pdf/%D9%83%D8%AA%D8%A8%20%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D9%8A%D8%A7%D8%AA%20Mathematics%20books/Wavelets/25%20(2).pdf]]
 3. C. W. Kok, Fast algorithm for computing discrete cosine transform
    [[https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=558495]]
+4. Nayuki, Fast discrete cosine transform algorithms
+   [[https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms]]
+5. Nayuki, Free small FFT in multiple languages
+   [[https://www.nayuki.io/page/free-small-fft-in-multiple-languages]]
diff --git a/ref/README.org b/ref/README.org
@@ -2,4 +2,6 @@
 
 This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
 
-For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]
+For more details, please visit Nayuki's website
+- https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
diff --git a/src/complex_array.hpp b/src/complex_array.hpp
@@ -1,5 +1,6 @@
-#ifndef COMPLEX_ARRAY_HPP
-#define COMPLEX_ARRAY_HPP
+#pragma once
+#include <immintrin.h>
+
 #include <algorithm>
 #include <complex>
 #include <stdexcept>
@@ -101,37 +102,65 @@ class ComplexArray {
         if (array_size != other.array_size) {
             throw std::invalid_argument("Array sizes do not match");
         }
-
-        for (size_t i = 0; i < array_size; i++) {
-            double temp_real =
-                real[i] * other.real[i] - imag[i] * other.imag[i];
-            double temp_imag =
-                real[i] * other.imag[i] + imag[i] * other.real[i];
-            real[i] = temp_real;
-            imag[i] = temp_imag;
+        if (simd) {
+            for (size_t i = 0; i <= array_size - 4; i += 4) {
+                __m256d real_vec = _mm256_loadu_pd(real + i);
+                __m256d imag_vec = _mm256_loadu_pd(imag + i);
+                __m256d other_real_vec = _mm256_loadu_pd(other.real + i);
+                __m256d other_imag_vec = _mm256_loadu_pd(other.imag + i);
+
+                __m256d temp_real =
+                    _mm256_sub_pd(_mm256_mul_pd(real_vec, other_real_vec),
+                                  _mm256_mul_pd(imag_vec, other_imag_vec));
+                __m256d temp_imag =
+                    _mm256_add_pd(_mm256_mul_pd(real_vec, other_imag_vec),
+                                  _mm256_mul_pd(imag_vec, other_real_vec));
+
+                _mm256_storeu_pd(real + i, temp_real);
+                _mm256_storeu_pd(imag + i, temp_imag);
+            }
+            for (size_t i = array_size - array_size % 4; i < array_size; i++) {
+                double temp_real =
+                    real[i] * other.real[i] - imag[i] * other.imag[i];
+                double temp_imag =
+                    real[i] * other.imag[i] + imag[i] * other.real[i];
+                real[i] = temp_real;
+                imag[i] = temp_imag;
+            }
+        } else {
+            for (size_t i = 0; i < array_size; i++) {
+                double temp_real =
+                    real[i] * other.real[i] - imag[i] * other.imag[i];
+                double temp_imag =
+                    real[i] * other.imag[i] + imag[i] * other.real[i];
+                real[i] = temp_real;
+                imag[i] = temp_imag;
+            }
         }
     }
 
-    // void partial_multiply(const ComplexArray &other, size_t start, size_t length) {
-
-    //     size_t end = start + length;
-    //     if (end > array_size || end > other.array_size) {
-    //         throw std::invalid_argument("Array sizes do not match");
-    //     }
-    //     for (size_t i = 0; i < length; i++) {
-    //         double temp_real =
-    //             real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
-    //         double temp_imag =
-    //             real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
-    //         real[start + i] = temp_real;
-    //         imag[start + i] = temp_imag;
-    //     }
-    // }
-
     void scale(double scalar, bool simd = false) {
-        for (size_t i = 0; i < array_size; i++) {
-            real[i] *= scalar;
-            imag[i] *= scalar;
+        if (simd) {
+            __m256d scalar_vec = _mm256_set1_pd(scalar);
+            for (size_t i = 0; i <= array_size - 4; i += 4) {
+                __m256d real_vec = _mm256_loadu_pd(real + i);
+                __m256d imag_vec = _mm256_loadu_pd(imag + i);
+
+                real_vec = _mm256_mul_pd(real_vec, scalar_vec);
+                imag_vec = _mm256_mul_pd(imag_vec, scalar_vec);
+
+                _mm256_storeu_pd(real + i, real_vec);
+                _mm256_storeu_pd(imag + i, imag_vec);
+            }
+            for (size_t i = array_size - array_size % 4; i < array_size; i++) {
+                real[i] *= scalar;
+                imag[i] *= scalar;
+            }
+        } else {
+            for (size_t i = 0; i < array_size; i++) {
+                real[i] *= scalar;
+                imag[i] *= scalar;
+            }
         }
     }
 
@@ -165,8 +194,6 @@ class ComplexArray {
         return ComplexProxy(real + index, imag + index);
     }
 
-
-
     size_t size() const { return array_size; }
 
     double *get_real_ptr() { return real; }
@@ -177,5 +204,3 @@ class ComplexArray {
 
     const double *get_imag_ptr() const { return imag; }
 };
-
-#endif
diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp
@@ -12,7 +12,7 @@ namespace py = pybind11;
 constexpr double SQRT2_1 = 0.7071067811865475;
 constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
 
-static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) {
+static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
   double sum = 0.0f;
   for (size_t j = 0; j < buf_size; j++) {
     sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
@@ -28,42 +28,44 @@ static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
   return sum;
 }
 
-void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
+void dct_naive_inner(double *buf, size_t buf_size, double *out) {
+#pragma omp parallel for
   for (size_t i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd);
+    out[i] = 2.0 * dct_naive_single(buf, buf_size, i);
   }
 }
 
-void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
+void idct_naive_inner(double *buf, size_t buf_size, double *out) {
+#pragma omp parallel for
   for (size_t i = 0; i < buf_size; i++) {
     out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size);
   }
 }
 
 py::array dct_naive(
-  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
   py::buffer_info buf_info = in_arr.request();
   double *ptr = static_cast<double *>(buf_info.ptr);
   size_t buf_size = buf_info.size;
 
   py::array_t<double> out_arr(buf_info.shape);
 
   double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  dct_naive_inner(ptr, buf_size, out_buf, simd);
+  dct_naive_inner(ptr, buf_size, out_buf);
 
   return out_arr;
 }
 
 py::array idct_naive(
-  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
   py::buffer_info buf_info = in_arr.request();
   double *ptr = static_cast<double *>(buf_info.ptr);
   size_t buf_size = buf_info.size;
 
   py::array_t<double> out_arr(buf_info.shape);
 
   double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  idct_naive_inner(ptr, buf_size, out_buf, simd);
+  idct_naive_inner(ptr, buf_size, out_buf);
 
   return out_arr;
 }

diff --git a/src/fdct.hpp b/src/fdct.hpp
@@ -21,7 +21,7 @@ namespace py = pybind11;
 
 // Wrapper function to accept and return NumPy arrays
 py::array fdct(
-    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
     // Request a buffer from the input array
     py::buffer_info buf = in_arr.request();
     // Check if the input array is 1-dimensional or 2-dimensional
@@ -38,57 +38,16 @@ py::array fdct(
               static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
     std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
     // Perform FCT
-    double *result = new double[N];
-    fct_inner(input_data, result, simd);
-
+    py::array_t<double, py::array::c_style> result(N);
+    double *result_ptr = static_cast<double *>(result.request().ptr);
+    fct_inner(input_data, result_ptr);
     // Convert result back to NumPy array
-    return py::array_t<double>(N, result);
-}
-
-std::vector<double> idct_inner(const std::vector<double> &c) {
-    size_t N = c.size();
-    std::vector<double> x(N);
-    // auto u = reorderOutput(c);
-    ComplexArray u(N);
-    // Allocate FFTW input/output arrays
-    fftw_complex *fft_in =
-        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
-    fftw_complex *fft_out =
-        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
-
-    // Initialize FFTW plan
-    fftw_plan plan =
-        fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);
-
-    // Fill FFTW input array
-    for (size_t n = 0; n < N; ++n) {
-        fft_in[n][0] = c[n];  // Real part
-        fft_in[n][1] = 0.0;   // Imaginary part
-    }
-
-    // Execute IFFT
-    fftw_execute(plan);
-
-    // Compute the IDCT using the IFFT output
-    for (size_t k = 0; k < N; ++k) {
-        double real = fft_out[k][0];
-        double imag = fft_out[k][1];
-        x[k] = (real * cos(M_PI * k / (2.0 * N)) +
-                imag * sin(M_PI * k / (2.0 * N))) /
-               N;
-    }
-
-    // Free FFTW resources
-    fftw_destroy_plan(plan);
-    fftw_free(fft_in);
-    fftw_free(fft_out);
-
-    return x;
+    return result;
 }
 
 // Wrapper function to accept and return NumPy arrays
 py::array ifdct(
-    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
     // Request a buffer from the input array
     py::buffer_info buf = in_arr.request();
     if (buf.ndim != 1) {
@@ -105,7 +64,8 @@ py::array ifdct(
 
     // Perform IDCT
     double *result = new double[N];
-    ifdct_inner(input_data, result, simd);
+    ifdct_inner(input_data, result);
+    // std::fill(result, result + N, 0.0);
     // Convert result back to NumPy array
     return py::array_t<double>(N, result);
 }
diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp
@@ -13,6 +13,7 @@
 
 // Function to compute the Fast Cosine Transform
 
+
 ComplexArray reorderInput(const ComplexArray &x) {
     size_t N = x.size();
     ComplexArray u(N);
@@ -74,7 +75,7 @@ void idctPreprocess(ComplexArray &x) {
 }
 
 
-void fct_inner(const ComplexArray &x, double *result, bool simd) {
+void fct_inner(const ComplexArray &x, double *result) {
     size_t N = x.size();
     auto fft_in = reorderInput(x);
 
@@ -84,14 +85,15 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) {
     // Compute the DCT using the FFT output
     double *x_real = fft_in.get_real_ptr();
     double *x_imag = fft_in.get_imag_ptr();
+#pragma omp parallel for
     for (size_t k = 0; k < N; ++k) {
         result[k] =
             (x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) *
             2;
     }
 }
 
-void ifdct_inner(const ComplexArray &c, double *result, bool simd) {
+void ifdct_inner(const ComplexArray &c, double *result) {
     size_t N = c.size();
     ComplexArray fft_in = c;
     idctPreprocess(fft_in);

diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp
@@ -12,6 +12,6 @@
 
 ComplexArray reorderInput(const ComplexArray &x);
 
-void fct_inner(const ComplexArray &x, double *result, bool simd);
+void fct_inner(const ComplexArray &x, double *result);
 
-void ifdct_inner(const ComplexArray &c, double *result, bool simd);
+void ifdct_inner(const ComplexArray &c, double *result);
diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp
@@ -3,14 +3,25 @@
 #include <pybind11/stl.h>
 #include "fdct.hpp"
 #include "dct_naive.hpp"
+#include "complex_array.hpp"
 
 namespace py = pybind11;
 #define _USE_MATH_DEFINES
 #include <cmath>
 
+void _set_num_thread(size_t n) {
+  fft::set_num_threads(n);
+}
+
+void _set_simd(bool s) {
+  fft::set_simd(s);
+}
+
 PYBIND11_MODULE(fdct, m) {
   m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation.");
   m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation.");
   m.def("fdct", &fdct, "Compute fast discrete cosine transformation.");
   m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation.");
+  m.def("set_num_threads", &_set_num_thread, "Set the number of threads to use.");
+  m.def("set_simd", _set_simd, "Set whether to use SIMD instructions.");
 }