diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4afb28a..de69a9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,15 @@
 cmake_minimum_required(VERSION 3.5)
 project(fdct)

+add_compile_options(-Wall)
+
+# AVX2
+add_compile_options(-mavx2)
+
+# OpenMP
+find_package(OpenMP REQUIRED)
+add_compile_options(${OpenMP_CXX_FLAGS})
+
 # Find Python
 find_package(PythonLibs REQUIRED)
 include_directories(${PYTHON_INCLUDE_DIRS})
@@ -8,10 +17,10 @@ include_directories(${PYTHON_INCLUDE_DIRS})
 # Find Pybind11
 find_package(pybind11 REQUIRED)

-find_package(PkgConfig REQUIRED)
-pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
-include_directories(PkgConfig::FFTW)
-link_libraries(PkgConfig::FFTW)
+# find_package(PkgConfig REQUIRED)
+# pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
+# include_directories(PkgConfig::FFTW)
+# link_libraries(PkgConfig::FFTW)

 pybind11_add_module(fdct
     src/fdct_module.cpp
@@ -21,3 +30,4 @@ pybind11_add_module(fdct

 # Link the module with Python
 target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
+target_link_libraries(fdct PRIVATE OpenMP::OpenMP_CXX)
diff --git a/README.org b/README.org
index e1782c1..6ed7575 100644
--- a/README.org
+++ b/README.org
@@ -43,7 +43,7 @@ modern multicore processors.
 :PROPERTIES:
 :CUSTOM_ID: prospective-users
 :END:
-FDCP will be valuable in fieds where DCT is used extensively:
+FDCT will be valuable in fields where DCT is used extensively:

 - Image processing and compression
 - Audio processing and compression
@@ -136,3 +136,7 @@ multithread=True, simd=True)
    [[https://coehuman.uodiyala.edu.iq/uploads/Coehuman%20library%20pdf/%D9%83%D8%AA%D8%A8%20%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D9%8A%D8%A7%D8%AA%20Mathematics%20books/Wavelets/25%20(2).pdf]]
 3. C. W. Kok, Fast algorithm for computing discrete cosine transform
    [[https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=558495]]
+4. Nayuki, Fast discrete cosine transform algorithms
+   [[https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms]]
+5. Nayuki, Free small FFT in multiple languages
+   [[https://www.nayuki.io/page/free-small-fft-in-multiple-languages]]
diff --git a/ref/README.org b/ref/README.org
index 74fd6b9..2befb38 100644
--- a/ref/README.org
+++ b/ref/README.org
@@ -2,4 +2,6 @@
 This reference implementation is provided by Nayuki.
 I greatly appreciate their contribution.

-For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]] +For more details, please visit Nayuki's website +- https://www.nayuki.io/page/free-small-fft-in-multiple-languages +- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms diff --git a/src/complex_array.hpp b/src/complex_array.hpp index 911d8fa..e199c5b 100644 --- a/src/complex_array.hpp +++ b/src/complex_array.hpp @@ -1,5 +1,6 @@ -#ifndef COMPLEX_ARRAY_HPP -#define COMPLEX_ARRAY_HPP +#pragma once +#include + #include #include #include @@ -101,37 +102,65 @@ class ComplexArray { if (array_size != other.array_size) { throw std::invalid_argument("Array sizes do not match"); } - - for (size_t i = 0; i < array_size; i++) { - double temp_real = - real[i] * other.real[i] - imag[i] * other.imag[i]; - double temp_imag = - real[i] * other.imag[i] + imag[i] * other.real[i]; - real[i] = temp_real; - imag[i] = temp_imag; + if (simd) { + for (size_t i = 0; i <= array_size - 4; i += 4) { + __m256d real_vec = _mm256_loadu_pd(real + i); + __m256d imag_vec = _mm256_loadu_pd(imag + i); + __m256d other_real_vec = _mm256_loadu_pd(other.real + i); + __m256d other_imag_vec = _mm256_loadu_pd(other.imag + i); + + __m256d temp_real = + _mm256_sub_pd(_mm256_mul_pd(real_vec, other_real_vec), + _mm256_mul_pd(imag_vec, other_imag_vec)); + __m256d temp_imag = + _mm256_add_pd(_mm256_mul_pd(real_vec, other_imag_vec), + _mm256_mul_pd(imag_vec, other_real_vec)); + + _mm256_storeu_pd(real + i, temp_real); + _mm256_storeu_pd(imag + i, temp_imag); + } + for (size_t i = array_size - array_size % 4; i < array_size; i++) { + double temp_real = + real[i] * other.real[i] - imag[i] * other.imag[i]; + double temp_imag = + real[i] * other.imag[i] + imag[i] * other.real[i]; + real[i] = temp_real; + imag[i] = temp_imag; + } + } else { + for (size_t i = 0; i < array_size; i++) { + double temp_real = + real[i] * other.real[i] - imag[i] * other.imag[i]; + double temp_imag = + real[i] * other.imag[i] + imag[i] * other.real[i]; + real[i] = temp_real; + imag[i] = temp_imag; + } } } - // void partial_multiply(const ComplexArray &other, size_t start, size_t length) { - - // size_t end = start + length; - // if (end > array_size || end > other.array_size) { - // throw std::invalid_argument("Array sizes do not match"); - // } - // for (size_t i = 0; i < length; i++) { - // double temp_real = - // real[start + i] * other.real[i] - imag[start + i] * other.imag[i]; - // double temp_imag = - // real[start + i] * other.imag[i] + imag[start + i] * other.real[i]; - // real[start + i] = temp_real; - // imag[start + i] = temp_imag; - // } - // } - void scale(double scalar, bool simd = false) { - for (size_t i = 0; i < array_size; i++) { - real[i] *= scalar; - imag[i] *= scalar; + if (simd) { + __m256d scalar_vec = _mm256_set1_pd(scalar); + for (size_t i = 0; i <= array_size - 4; i += 4) { + __m256d real_vec = _mm256_loadu_pd(real + i); + __m256d imag_vec = _mm256_loadu_pd(imag + i); + + real_vec = _mm256_mul_pd(real_vec, scalar_vec); + imag_vec = _mm256_mul_pd(imag_vec, scalar_vec); + + _mm256_storeu_pd(real + i, real_vec); + _mm256_storeu_pd(imag + i, imag_vec); + } + for (size_t i = array_size - array_size % 4; i < array_size; i++) { + real[i] *= scalar; + imag[i] *= scalar; + } + } else { + for (size_t i = 0; i < array_size; i++) { + real[i] *= scalar; + imag[i] *= scalar; + } } } @@ -165,8 +194,6 @@ class ComplexArray { return ComplexProxy(real + index, imag + index); } - - size_t size() const { return array_size; } double *get_real_ptr() { return real; } @@ 
-177,5 +204,3 @@ class ComplexArray { const double *get_imag_ptr() const { return imag; } }; - -#endif diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp index 07e19fe..bb68728 100644 --- a/src/dct_naive.hpp +++ b/src/dct_naive.hpp @@ -12,7 +12,7 @@ namespace py = pybind11; constexpr double SQRT2_1 = 0.7071067811865475; constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475; -static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) { +static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) { double sum = 0.0f; for (size_t j = 0; j < buf_size; j++) { sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5)); @@ -28,20 +28,22 @@ static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) { return sum; } -void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) { +void dct_naive_inner(double *buf, size_t buf_size, double *out) { +#pragma omp parallel for for (size_t i = 0; i < buf_size; i++) { - out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd); + out[i] = 2.0 * dct_naive_single(buf, buf_size, i); } } -void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) { +void idct_naive_inner(double *buf, size_t buf_size, double *out) { +#pragma omp parallel for for (size_t i = 0; i < buf_size; i++) { out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size); } } py::array dct_naive( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { py::buffer_info buf_info = in_arr.request(); double *ptr = static_cast(buf_info.ptr); size_t buf_size = buf_info.size; @@ -49,13 +51,13 @@ py::array dct_naive( py::array_t out_arr(buf_info.shape); double *out_buf = static_cast(out_arr.request(true).ptr); - dct_naive_inner(ptr, buf_size, out_buf, simd); + dct_naive_inner(ptr, buf_size, out_buf); return out_arr; } py::array idct_naive( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { py::buffer_info buf_info = in_arr.request(); double *ptr = static_cast(buf_info.ptr); size_t buf_size = buf_info.size; @@ -63,7 +65,7 @@ py::array idct_naive( py::array_t out_arr(buf_info.shape); double *out_buf = static_cast(out_arr.request(true).ptr); - idct_naive_inner(ptr, buf_size, out_buf, simd); + idct_naive_inner(ptr, buf_size, out_buf); return out_arr; } diff --git a/src/fdct.hpp b/src/fdct.hpp index 476d7fc..64b8fdf 100644 --- a/src/fdct.hpp +++ b/src/fdct.hpp @@ -21,7 +21,7 @@ namespace py = pybind11; // Wrapper function to accept and return NumPy arrays py::array fdct( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { // Request a buffer from the input array py::buffer_info buf = in_arr.request(); // Check if the input array is 1-dimensional or 2-dimensional @@ -38,57 +38,16 @@ py::array fdct( static_cast(buf.ptr) + N, input_data.get_real_ptr()); std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0); // Perform FCT - double *result = new double[N]; - fct_inner(input_data, result, simd); - + py::array_t result(N); + double *result_ptr = static_cast(result.request().ptr); + fct_inner(input_data, result_ptr); // Convert result back to NumPy array - return py::array_t(N, result); -} - -std::vector idct_inner(const std::vector &c) { - size_t N = c.size(); - std::vector x(N); - // auto u = reorderOutput(c); - ComplexArray u(N); - // Allocate FFTW input/output arrays - fftw_complex *fft_in = - (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); - fftw_complex *fft_out = - (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); - - // Initialize FFTW plan - 
fftw_plan plan = - fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE); - - // Fill FFTW input array - for (size_t n = 0; n < N; ++n) { - fft_in[n][0] = c[n]; // Real part - fft_in[n][1] = 0.0; // Imaginary part - } - - // Execute IFFT - fftw_execute(plan); - - // Compute the IDCT using the IFFT output - for (size_t k = 0; k < N; ++k) { - double real = fft_out[k][0]; - double imag = fft_out[k][1]; - x[k] = (real * cos(M_PI * k / (2.0 * N)) + - imag * sin(M_PI * k / (2.0 * N))) / - N; - } - - // Free FFTW resources - fftw_destroy_plan(plan); - fftw_free(fft_in); - fftw_free(fft_out); - - return x; + return result; } // Wrapper function to accept and return NumPy arrays py::array ifdct( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { // Request a buffer from the input array py::buffer_info buf = in_arr.request(); if (buf.ndim != 1) { @@ -105,7 +64,8 @@ py::array ifdct( // Perform IDCT double *result = new double[N]; - ifdct_inner(input_data, result, simd); + ifdct_inner(input_data, result); + // std::fill(result, result + N, 0.0); // Convert result back to NumPy array return py::array_t(N, result); } diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp index 4b717dd..71d3ac4 100644 --- a/src/fdct_inner.cpp +++ b/src/fdct_inner.cpp @@ -13,6 +13,7 @@ // Function to compute the Fast Cosine Transform + ComplexArray reorderInput(const ComplexArray &x) { size_t N = x.size(); ComplexArray u(N); @@ -74,7 +75,7 @@ void idctPreprocess(ComplexArray &x) { } -void fct_inner(const ComplexArray &x, double *result, bool simd) { +void fct_inner(const ComplexArray &x, double *result) { size_t N = x.size(); auto fft_in = reorderInput(x); @@ -84,6 +85,7 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) { // Compute the DCT using the FFT output double *x_real = fft_in.get_real_ptr(); double *x_imag = fft_in.get_imag_ptr(); +#pragma omp parallel for for (size_t k = 0; k < N; ++k) { result[k] = (x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) * @@ -91,7 +93,7 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) { } } -void ifdct_inner(const ComplexArray &c, double *result, bool simd) { +void ifdct_inner(const ComplexArray &c, double *result) { size_t N = c.size(); ComplexArray fft_in = c; idctPreprocess(fft_in); diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp index 38dd6a3..cdbcd36 100644 --- a/src/fdct_inner.hpp +++ b/src/fdct_inner.hpp @@ -12,6 +12,6 @@ ComplexArray reorderInput(const ComplexArray &x); -void fct_inner(const ComplexArray &x, double *result, bool simd); +void fct_inner(const ComplexArray &x, double *result); -void ifdct_inner(const ComplexArray &c, double *result, bool simd); +void ifdct_inner(const ComplexArray &c, double *result); diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp index 38c5d1e..fa6b56d 100644 --- a/src/fdct_module.cpp +++ b/src/fdct_module.cpp @@ -3,14 +3,25 @@ #include #include "fdct.hpp" #include "dct_naive.hpp" +#include "complex_array.hpp" namespace py = pybind11; #define _USE_MATH_DEFINES #include +void _set_num_thread(size_t n) { + fft::set_num_threads(n); +} + +void _set_simd(bool s) { + fft::set_simd(s); +} + PYBIND11_MODULE(fdct, m) { m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation."); m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation."); m.def("fdct", &fdct, "Compute fast discrete cosine transformation."); m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation."); + m.def("set_num_threads", 
&_set_num_thread, "Set the number of threads to use."); + m.def("set_simd", _set_simd, "Set whether to use SIMD instructions."); } diff --git a/src/fft.cpp b/src/fft.cpp index 0e8bfb0..bf6f847 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -1,3 +1,10 @@ +#include "fft.hpp" + +#include +#include +#include +#include + #include #include #include @@ -5,152 +12,255 @@ #include #include #include -#include #include "complex_array.hpp" -#include "fft.hpp" static size_t reverseBits(size_t val, int width); +void fft::set_num_threads(size_t n) { omp_set_num_threads(n); } + +void fft::set_simd(bool s) { simd = s; } + static ComplexArray mkExpTable(size_t n, bool inverse) { - ComplexArray expTable(n); - double *real = expTable.get_real_ptr(); - double *imag = expTable.get_imag_ptr(); - for (size_t i = 0; i < n; i++) { - double angle = (inverse ? 2 : -2) * M_PI * i / n; - real[i] = std::cos(angle); - imag[i] = std::sin(angle); - } - return expTable; + ComplexArray expTable(n); + double *real = expTable.get_real_ptr(); + double *imag = expTable.get_imag_ptr(); + for (size_t i = 0; i < n; i++) { + double angle = (inverse ? 2 : -2) * M_PI * i / n; + real[i] = std::cos(angle); + imag[i] = std::sin(angle); + } + return expTable; } -void fft::transformRadix2(ComplexArray &x, bool inverse, bool simd) { - int levels = 0; - size_t n = x.size(); - double *real = x.get_real_ptr(); - double *imag = x.get_imag_ptr(); - - for (size_t temp = n; temp > 1U; temp >>= 1) - levels++; - if (static_cast(1U) << levels != n) - throw std::domain_error("Length is not a power of 2"); - - ComplexArray expTable = mkExpTable(n, inverse); - - - for (size_t i = 0; i < n; i++) { - size_t j = reverseBits(i, levels); - if (j > i) { - std::swap(real[i], real[j]); - std::swap(imag[i], imag[j]); +void fft::transformRadix2(ComplexArray &x, bool inverse) { + int levels = 0; + size_t n = x.size(); + double *real = x.get_real_ptr(); + double *imag = x.get_imag_ptr(); + + for (size_t temp = n; temp > 1U; temp >>= 1) levels++; + if (static_cast(1U) << levels != n) + throw std::domain_error("Length is not a power of 2"); + + ComplexArray expTable = mkExpTable(n, inverse); +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + size_t j = reverseBits(i, levels); + if (j > i) { + std::swap(real[i], real[j]); + std::swap(imag[i], imag[j]); + } } - } - - double *real_expTable = expTable.get_real_ptr(); - double *imag_expTable = expTable.get_imag_ptr(); - - for (size_t size = 2; size <= n; size *= 2) { - size_t halfsize = size / 2; - size_t tablestep = n / size; - for (size_t i = 0; i < n; i += size) { - for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) { - double temp_real = real[j + halfsize] * real_expTable[k] - - imag[j + halfsize] * imag_expTable[k]; - double temp_imag = real[j + halfsize] * imag_expTable[k] + - imag[j + halfsize] * real_expTable[k]; - real[j + halfsize] = real[j] - temp_real; - imag[j + halfsize] = imag[j] - temp_imag; - real[j] += temp_real; - imag[j] += temp_imag; - } + + double *real_expTable = expTable.get_real_ptr(); + double *imag_expTable = expTable.get_imag_ptr(); + + for (size_t size = 2; size <= n; size *= 2) { + size_t halfsize = size / 2; + size_t tablestep = n / size; +#pragma omp parallel for + for (size_t i = 0; i < n; i += size) { + for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) { + double temp_real = real[j + halfsize] * real_expTable[k] - + imag[j + halfsize] * imag_expTable[k]; + double temp_imag = real[j + halfsize] * imag_expTable[k] + + imag[j + halfsize] * 
real_expTable[k]; + real[j + halfsize] = real[j] - temp_real; + imag[j + halfsize] = imag[j] - temp_imag; + real[j] += temp_real; + imag[j] += temp_imag; + } + } + if (size == n) break; } - if (size == n) - break; - } } -void fft::transformBluestein(ComplexArray &x, bool inverse, bool simd) { - size_t n = x.size(); - double *real = x.get_real_ptr(); - double *imag = x.get_imag_ptr(); - size_t m = 1; - while (m / 2 <= n) { - if (m > SIZE_MAX / 2) - throw std::length_error("Vector too large"); - m *= 2; - } - - // Trigonometric table - ComplexArray expTable(n); - double *real_expTable = expTable.get_real_ptr(); - double *imag_expTable = expTable.get_imag_ptr(); - for (size_t i = 0; i < n; i++) { - uintmax_t temp = static_cast(i) * i; - temp %= static_cast(n) * 2; - double angle = (inverse ? M_PI : -M_PI) * temp / n; - expTable[i] = std::polar(1.0, angle); - } - ComplexArray a(m), b(m); - double *real_avec = a.get_real_ptr(); - double *imag_avec = a.get_imag_ptr(); - - - // TODO: Use SIMD - for (size_t i = 0; i < n; i++) { - real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; - imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; - // a[i] = Complex(x[i]) * Complex(expTable[i]); - } - - double *real_bvec = b.get_real_ptr(); - double *imag_bvec = b.get_imag_ptr(); - real_bvec[0] = real_expTable[0]; - imag_bvec[0] = imag_expTable[0]; - - for (size_t i = 1; i < n; ++i) { - real_bvec[i] = real_bvec[m - i] = real_expTable[i]; - imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; - } - - convolove(a, b, simd); - // x = c * expTable; - for (size_t i = 0; i < n; i++) { - real[i] = a.get_real_ptr()[i] * real_expTable[i] - a.get_imag_ptr()[i] * imag_expTable[i]; - imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + a.get_imag_ptr()[i] * real_expTable[i]; - } +static inline void compute_mul_exp(size_t n, const double *real, + const double *imag, + const double *real_expTable, + const double *imag_expTable, + double *real_avec, double *imag_avec, + bool simd) { + if (simd) { +// Process in chunks of 4 using AVX2 +#pragma omp parallel for + for (size_t i = 0; i <= n - 4; i += 4) { + // Load 4 elements from each array + __m256d real_vec = _mm256_loadu_pd(&real[i]); + __m256d imag_vec = _mm256_loadu_pd(&imag[i]); + __m256d real_exp_vec = _mm256_loadu_pd(&real_expTable[i]); + __m256d imag_exp_vec = _mm256_loadu_pd(&imag_expTable[i]); + + // Compute real_avec = real * real_expTable - imag * imag_expTable + __m256d real_result = + _mm256_sub_pd(_mm256_mul_pd(real_vec, real_exp_vec), + _mm256_mul_pd(imag_vec, imag_exp_vec)); + + // Compute imag_avec = real * imag_expTable + imag * real_expTable + __m256d imag_result = + _mm256_add_pd(_mm256_mul_pd(real_vec, imag_exp_vec), + _mm256_mul_pd(imag_vec, real_exp_vec)); + + // Store the results back to memory + _mm256_storeu_pd(&real_avec[i], real_result); + _mm256_storeu_pd(&imag_avec[i], imag_result); + } + + // Handle remaining elements (n % 4 != 0) + for (size_t i = n - n % 4; i < n; i++) { + real_avec[i] = + real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; + imag_avec[i] = + real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; + } + } else { +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + real_avec[i] = + real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; + imag_avec[i] = + real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; + } + } } +static inline void setup_a_b(size_t n, size_t m, const double *real_expTable, + const double *imag_expTable, double *real_bvec, + double *imag_bvec, bool 
simd) { + real_bvec[0] = real_expTable[0]; + imag_bvec[0] = imag_expTable[0]; + + if (false) { +#pragma omp parallel for + for (size_t i = 1; i <= n - 4; i += 4) { + __m256d real_exp_vec = _mm256_loadu_pd(&real_expTable[i]); + __m256d imag_exp_vec = _mm256_loadu_pd(&imag_expTable[i]); + + _mm256_storeu_pd(&real_bvec[i], real_exp_vec); + _mm256_storeu_pd( + &real_bvec[m - i - 3], + _mm256_permute4x64_pd(real_exp_vec, _MM_SHUFFLE(0, 1, 2, 3))); + + __m256d neg_imag_exp_vec = + _mm256_sub_pd(_mm256_setzero_pd(), imag_exp_vec); + _mm256_storeu_pd(&imag_bvec[i], neg_imag_exp_vec); + _mm256_storeu_pd(&imag_bvec[m - i - 3], + _mm256_permute4x64_pd(neg_imag_exp_vec, + _MM_SHUFFLE(0, 1, 2, 3))); + } + + // Handle remaining elements (n % 4 != 0) + for (size_t i = n - n % 4; i < n; ++i) { + real_bvec[i] = real_bvec[m - i] = real_expTable[i]; + imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; + } + } else { +#pragma omp for + for (size_t i = 1; i < n; ++i) { + real_bvec[i] = real_bvec[m - i] = real_expTable[i]; + imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; + } + } +} + +void fft::transformBluestein(ComplexArray &x, bool inverse) { + size_t n = x.size(); + double *real = x.get_real_ptr(); + double *imag = x.get_imag_ptr(); + size_t m = 1; + while (m / 2 <= n) { + if (m > SIZE_MAX / 2) throw std::length_error("Vector too large"); + m *= 2; + } + + // Trigonometric table + ComplexArray expTable(n); + double *real_expTable = expTable.get_real_ptr(); + double *imag_expTable = expTable.get_imag_ptr(); +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + uintmax_t temp = static_cast(i) * i; + temp %= static_cast(n) * 2; + double angle = (inverse ? M_PI : -M_PI) * temp / n; + expTable[i] = std::polar(1.0, angle); + } + ComplexArray a(m), b(m); + double *real_avec = a.get_real_ptr(); + double *imag_avec = a.get_imag_ptr(); + + // TODO: Use SIMD + compute_mul_exp(n, real, imag, real_expTable, imag_expTable, real_avec, + imag_avec, simd); + + double *real_bvec = b.get_real_ptr(); + double *imag_bvec = b.get_imag_ptr(); + setup_a_b(n, m, real_expTable, imag_expTable, real_bvec, imag_bvec, simd); + + convolove(a, b); + // x = c * expTable; + + // for (size_t i = 0; i < n; i++) { + // real[i] = a.get_real_ptr()[i] * real_expTable[i] - + // a.get_imag_ptr()[i] * imag_expTable[i]; + // imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + + // a.get_imag_ptr()[i] * real_expTable[i]; + // } + compute_mul_exp(n, real_avec, imag_avec, real_expTable, imag_expTable, real, + imag, simd); +} // compute the convolution of two complex arrays and set the result to x -void fft::convolove(ComplexArray &_x, const ComplexArray &_y, bool simd) { - size_t n = _x.size(); - if (n != _y.size()) - throw std::domain_error("Mismatched lengths"); - - ComplexArray& x = _x; - ComplexArray y = _y; - - transform(x, false, simd); - transform(y, false, simd); - - x.multiply(y, simd); - transform(x, true, simd); - x.scale(1.0 / n, simd); +void fft::convolove(ComplexArray &_x, const ComplexArray &_y) { + size_t n = _x.size(); + if (n != _y.size()) throw std::domain_error("Mismatched lengths"); + + ComplexArray &x = _x; + ComplexArray y = _y; + + transform(x, false); + transform(y, false); + + x.multiply(y, simd); + transform(x, true); + x.scale(1.0 / n, simd); } -void fft::transform(ComplexArray &x, bool inverse, bool simd) { - size_t n = x.size(); - if (n == 0) - return; - else if ((n & (n - 1)) == 0) // Is power of 2 - transformRadix2(x, inverse, simd); - else // More complicated algorithm for arbitrary sizes - 
transformBluestein(x, inverse, simd); +void fft::transform(ComplexArray &x, bool inverse) { + size_t n = x.size(); + if (n == 0) + return; + else if ((n & (n - 1)) == 0) // Is power of 2 + transformRadix2(x, inverse); + else // More complicated algorithm for arbitrary sizes + transformBluestein(x, inverse); } static size_t reverseBits(size_t val, int width) { - size_t result = 0; - for (int i = 0; i < width; i++, val >>= 1) - result = (result << 1) | (val & 1U); - return result; + auto n = val; +#if __SIZEOF_SIZE_T__ == 8 + n = ((n & 0xffffffff00000000ULL) >> 32) | + ((n & 0x00000000ffffffffULL) << 32); + n = ((n & 0xffff0000ffff0000ULL) >> 16) | + ((n & 0x0000ffff0000ffffULL) << 16); + n = ((n & 0xff00ff00ff00ff00ULL) >> 8) | ((n & 0x00ff00ff00ff00ffULL) << 8); + n = ((n & 0xf0f0f0f0f0f0f0f0ULL) >> 4) | ((n & 0x0f0f0f0f0f0f0f0fULL) << 4); + n = ((n & 0xccccccccccccccccULL) >> 2) | ((n & 0x3333333333333333ULL) << 2); + n = ((n & 0xaaaaaaaaaaaaaaaaULL) >> 1) | ((n & 0x5555555555555555ULL) << 1); + return n >> (64 - width); +#elif __SIZEOF_SIZE_T__ == 4 + n = ((n & 0xffff0000) >> 16) | ((n & 0x0000ffff) << 16); + n = ((n & 0xff00ff00) >> 8) | ((n & 0x00ff00ff) << 8); + n = ((n & 0xf0f0f0f0) >> 4) | ((n & 0x0f0f0f0f) << 4); + n = ((n & 0xcccccccc) >> 2) | ((n & 0x33333333) << 2); + n = ((n & 0xaaaaaaaa) >> 1) | ((n & 0x55555555) << 1); + return n >> (32 - width); +#endif } + +// static size_t reverseBits(size_t val, int width) { +// size_t result = 0; +// for (int i = 0; i < width; i++, val >>= 1) +// result = (result << 1) | (val & 1U); +// return result; +// } \ No newline at end of file diff --git a/src/fft.hpp b/src/fft.hpp index 868eee7..d64a452 100644 --- a/src/fft.hpp +++ b/src/fft.hpp @@ -3,11 +3,19 @@ #include "complex_array.hpp" namespace fft { -void transformRadix2(ComplexArray &x, bool inverse, bool simd = false); -void transformBluestein(ComplexArray &x, bool inverse, bool simd = false); +static bool simd = false; -void transform(ComplexArray &x, bool inverse, bool simd = false); +void transformRadix2(ComplexArray &x, bool inverse); + +void transformBluestein(ComplexArray &x, bool inverse); + +void transform(ComplexArray &x, bool inverse); + +void convolove(ComplexArray &x, const ComplexArray &y); + +void set_num_threads(size_t n); + +void set_simd(bool s); -void convolove(ComplexArray &x, const ComplexArray &y, bool simd = false); } // namespace fft diff --git a/test_dct.py b/test_dct.py index 092c6f7..d67a68a 100644 --- a/test_dct.py +++ b/test_dct.py @@ -3,26 +3,62 @@ import fdct import scipy.fft as fft import time +import os # Parameters -test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097, 8192, 8193] -tolerance = 1e-6 # Tolerance for individual element comparison -simd = [False, True] # SIMD flag +test_size_seq_power_of_2 = [65536, 131072, 262144, 524288, 1048576, 2097152] +test_size_seq_odd = [x + 1 for x in test_size_seq_power_of_2] + +tolerance = 1e-6 # Tolerance for element comparison +num_threads = [1, 2, 4, 8] +simd = [True, False] + +# Combined parameter set: (size, is_power_of_two) +combined_sizes = [(size, True) for size in test_size_seq_power_of_2] + \ + [(size, False) for size in test_size_seq_odd] # List of test functions and their corresponding references test_cases = [ - ("dct_naive", fdct.dct_naive, fft.dct), - ("idct_naive", fdct.idct_naive, fft.idct), + # ("dct_naive", fdct.dct_naive, fft.dct), + # ("idct_naive", fdct.idct_naive, fft.idct), ("fdct", fdct.fdct, fft.dct), ("ifdct", fdct.ifdct, fft.idct), ] -@pytest.mark.parametrize("size", test_size_seq) +def 
write_csv_result(filename, header, row): + """Write a single row of results to a CSV file, adding a header if the file doesn't exist.""" + file_exists = os.path.isfile(filename) + with open(filename, "a", newline="") as f: + if not file_exists: + f.write(header + "\n") + f.write(row + "\n") + +@pytest.mark.parametrize("size,is_power_of_two", combined_sizes) @pytest.mark.parametrize("name, func, ref_func", test_cases) +@pytest.mark.parametrize("num_thread", num_threads) @pytest.mark.parametrize("simd", simd) -def test_transform(name, func, ref_func, size, simd): +def test_transform(name, func, ref_func, size, is_power_of_two, num_thread, simd): + # Generate input data inp = np.random.rand(size) + + # Set number of threads for fdct + fdct.set_num_threads(num_thread) + + fdct.set_simd(simd) + + # Perform reference transform (ground truth) gt = ref_func(inp) - ans = func(inp, simd) + + # Measure execution time + start_time = time.time() + ans = func(inp) + duration = time.time() - start_time + + # Verify correctness np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0) + # Write results to a single CSV file + filename = "combined_results.csv" + header = "function_name,size,num_threads,simd,is_power_of_two,duration_s" + row = f"{name},{size},{num_thread},{simd},{is_power_of_two},{duration}" + write_csv_result(filename, header, row) \ No newline at end of file
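
For readers who want to exercise the new runtime controls outside of pytest, the following sketch shows one way the module introduced by this diff could be driven. It assumes the extension has already been built and is importable as fdct, and that SciPy is available for a reference result, mirroring test_dct.py; the array sizes are illustrative.

import numpy as np
import scipy.fft as fft
import fdct

# Configure the runtime knobs added in src/fdct_module.cpp.
fdct.set_num_threads(4)   # forwarded to omp_set_num_threads in src/fft.cpp
fdct.set_simd(True)       # enables the AVX2 code paths in the ComplexArray/Bluestein helpers

x = np.random.rand(1 << 16)          # power-of-two length -> radix-2 FFT
y = np.random.rand((1 << 16) + 1)    # odd length -> Bluestein's algorithm

# Forward and inverse transforms, checked against SciPy as in test_dct.py.
np.testing.assert_allclose(fdct.fdct(x), fft.dct(x), atol=1e-6, rtol=0)
np.testing.assert_allclose(fdct.ifdct(y), fft.idct(y), atol=1e-6, rtol=0)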
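
The test appends one row per run to combined_results.csv. A small post-processing sketch, assuming pandas is installed (it is not a dependency declared anywhere in this diff), could summarize the recorded timings by thread count and SIMD setting:

import pandas as pd

# Column names come from the header written by write_csv_result in test_dct.py:
# function_name,size,num_threads,simd,is_power_of_two,duration_s
df = pd.read_csv("combined_results.csv")

# Median duration per configuration; rows accumulate across repeated pytest
# runs because the CSV is opened in append mode.
summary = (
    df.groupby(["function_name", "size", "num_threads", "simd"])["duration_s"]
      .median()
      .unstack("num_threads")
)
print(summary)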