From bc910702ae3c3f21cc6006cb173a561c89ec304a Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Mon, 9 Dec 2024 15:51:20 +0000
Subject: [PATCH 01/16] feat: New impl

---
 src/FftComplex.cpp     | 149 +++++++++++++++++++++++++++
 src/FftComplex.hpp     |  61 +++++++++++
 src/FftComplexTest.cpp | 177 +++++++++++++++++++++++++++++++
 src/complex.hpp        |  92 +++++++++++++++++
 src/fdct.cpp           |  98 ------------------
 src/fdct.hpp           | 229 +++++++++++++++++++++++++++++++++++++++++
 src/fdct_module.cpp    |  15 +++
 src/fft.cpp            | 143 +++++++++++++++++++++++++
 src/fft.hpp            |  16 +++
 test_dct.py            |  28 +++--
 10 files changed, 903 insertions(+), 105 deletions(-)
 create mode 100644 src/FftComplex.cpp
 create mode 100644 src/FftComplex.hpp
 create mode 100644 src/FftComplexTest.cpp
 create mode 100644 src/complex.hpp
 delete mode 100644 src/fdct.cpp
 create mode 100644 src/fdct.hpp
 create mode 100644 src/fdct_module.cpp
 create mode 100644 src/fft.cpp
 create mode 100644 src/fft.hpp
diff --git a/src/FftComplex.cpp b/src/FftComplex.cpp
new file mode 100644
index 0000000..4f842a0
--- /dev/null
+++ b/src/FftComplex.cpp
@@ -0,0 +1,149 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <utility>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<complex<double> > &vec, bool inverse) {
+	size_t n = vec.size();
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(vec, inverse);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(vec, inverse);
+}
+
+
+void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
+	// Length variables
+	size_t n = vec.size();
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++)
+		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i)
+			std::swap(vec[i], vec[j]);
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				complex<double> temp = vec[j + halfsize] * expTable[k];
+				vec[j + halfsize] = vec[j] - temp;
+				vec[j] += temp;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = vec.size();
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = (inverse ? M_PI : -M_PI) * temp / n;
+		expTable[i] = std::polar(1.0, angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<complex<double> > avec(m);
+	for (size_t i = 0; i < n; i++)
+		avec[i] = vec[i] * expTable[i];
+	vector<complex<double> > bvec(m);
+	bvec[0] = expTable[0];
+	for (size_t i = 1; i < n; i++)
+		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
+	
+	// Convolution
+	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++)
+		vec[i] = cvec[i] * expTable[i];
+}
+
+
+vector<complex<double> > Fft::convolve(
+		vector<complex<double> > xvec,
+		vector<complex<double> > yvec) {
+	
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::domain_error("Mismatched lengths");
+	transform(xvec, false);
+	transform(yvec, false);
+	for (size_t i = 0; i < n; i++)
+		xvec[i] *= yvec[i];
+	transform(xvec, true);
+	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
+		xvec[i] /= static_cast<double>(n);
+	return xvec;
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
diff --git a/src/FftComplex.hpp b/src/FftComplex.hpp
new file mode 100644
index 0000000..24eeabb
--- /dev/null
+++ b/src/FftComplex.hpp
@@ -0,0 +1,61 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <complex>
+#include <vector>
+using Complex = std::complex<double>;
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
+	 */
+	void transform(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::vector<std::complex<double> > convolve(
+		std::vector<std::complex<double> > xvec,
+		std::vector<std::complex<double> > yvec);
+	
+}
diff --git a/src/FftComplexTest.cpp b/src/FftComplexTest.cpp
new file mode 100644
index 0000000..dd960d1
--- /dev/null
+++ b/src/FftComplexTest.cpp
@@ -0,0 +1,177 @@
+/* 
+ * FFT and convolution test (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::cout;
+using std::endl;
+using std::vector;
+
+
+// Private function prototypes
+static void testFft(int n);
+static void testConvolution(int n);
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
+static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static vector<complex<double> > randomComplexes(int n);
+
+// Mutable global variable
+static double maxLogError = -INFINITY;
+
+// Random number generation
+std::default_random_engine randGen((std::random_device())());
+
+
+/*---- Main and test functions ----*/
+
+int main() {
+	// Test power-of-2 size FFTs
+	for (int i = 0; i <= 12; i++)
+		testFft(1 << i);
+	
+	// Test small size FFTs
+	for (int i = 0; i < 30; i++)
+		testFft(i);
+	
+	// Test diverse size FFTs
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testFft(n);
+			prev = n;
+		}
+	}
+	
+	// Test power-of-2 size convolutions
+	for (int i = 0; i <= 12; i++)
+		testConvolution(1 << i);
+	
+	// Test diverse size convolutions
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testConvolution(n);
+			prev = n;
+		}
+	}
+	
+	cout << endl;
+	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
+	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
+	return EXIT_SUCCESS;
+}
+
+
+static void testFft(int n) {
+	const vector<complex<double> > input = randomComplexes(n);
+	const vector<complex<double> > expect = naiveDft(input, false);
+	vector<complex<double> > actual = input;
+	Fft::transform(actual, false);
+	double err = log10RmsErr(expect, actual);
+	
+	for (auto it = actual.begin(); it != actual.end(); ++it)
+		*it /= n;
+	Fft::transform(actual, true);
+	err = std::max(log10RmsErr(input, actual), err);
+	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << err << endl;
+}
+
+
+static void testConvolution(int n) {
+	const vector<complex<double> > input0 = randomComplexes(n);
+	const vector<complex<double> > input1 = randomComplexes(n);
+	const vector<complex<double> > expect = naiveConvolve(input0, input1);
+	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
+	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << log10RmsErr(expect, actual) << endl;
+}
+
+
+/*---- Naive reference computation functions ----*/
+
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
+	int n = static_cast<int>(input.size());
+	vector<complex<double> > output;
+	double coef = (inverse ? 2 : -2) * M_PI / n;
+	for (int k = 0; k < n; k++) {  // For each output element
+		complex<double> sum(0);
+		for (int t = 0; t < n; t++) {  // For each input element
+			double angle = coef * (static_cast<long long>(t) * k % n);
+			sum += input[t] * std::polar(1.0, angle);
+		}
+		output.push_back(sum);
+	}
+	return output;
+}
+
+
+static vector<complex<double> > naiveConvolve(
+		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	vector<complex<double> > result(n);  // All zeros
+	for (int i = 0; i < n; i++) {
+		for (int j = 0; j < n; j++) {
+			int k = (i + j) % n;
+			result[k] += xvec[i] * yvec[j];
+		}
+	}
+	return result;
+}
+
+
+/*---- Utility functions ----*/
+
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	double err = std::pow(10, -99 * 2);
+	for (int i = 0; i < n; i++)
+		err += std::norm(xvec.at(i) - yvec.at(i));
+	err /= n > 0 ? n : 1;
+	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
+	err = std::log10(err);
+	maxLogError = std::max(err, maxLogError);
+	return err;
+}
+
+
+static vector<complex<double> > randomComplexes(int n) {
+	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
+	vector<complex<double> > result;
+	for (int i = 0; i < n; i++)
+		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
+	return result;
+}
diff --git a/src/complex.hpp b/src/complex.hpp
new file mode 100644
index 0000000..23a2b65
--- /dev/null
+++ b/src/complex.hpp
@@ -0,0 +1,92 @@
+#ifndef COMPLEX_ARRAY_HPP
+#define COMPLEX_ARRAY_HPP
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+
+using Complex = std::complex<double>;
+
+class ComplexArray {
+   private:
+    double *real;
+    double *imag;
+    size_t array_size;
+
+   public:
+    ComplexArray(size_t n) : array_size(n) {
+        if (n == 0) {
+            real = nullptr;
+            imag = nullptr;
+        } else {
+            real = new double[n];
+            imag = new double[n];
+        }
+    }
+
+    ~ComplexArray() {
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+    }
+
+    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+        real = new double[array_size];
+        imag = new double[array_size];
+        std::copy(other.real, other.real + array_size, real);
+        std::copy(other.imag, other.imag + array_size, imag);
+    }
+
+    ComplexArray &operator=(const ComplexArray &other) {
+        if (this == &other) {
+            return *this;
+        }
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+
+        array_size = other.array_size;
+        real = new double[array_size];
+        imag = new double[array_size];
+
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] = other.real[i];
+            imag[i] = other.imag[i];
+        }
+
+        return *this;
+    }
+
+    Complex operator[](size_t index) const {
+        return Complex(real[index], imag[index]);
+    }
+
+    size_t size() const {
+        return array_size;
+    }
+
+    double* get_real_ptr() {
+        return real;
+    }
+
+    const double* get_real_ptr() const {
+        return real;
+    }
+
+    double* get_imag_ptr() {
+        return imag;
+    }
+
+    const double* get_imag_ptr() const {
+        return imag;
+    }
+
+};
+
+#endif
diff --git a/src/fdct.cpp b/src/fdct.cpp
deleted file mode 100644
index 5163348..0000000
--- a/src/fdct.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-#include <cstddef>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <iostream>
-
-namespace py = pybind11;
-#define _USE_MATH_DEFINES
-#include <cmath>
-#include <iostream>
-
-using data_t = double;
-
-constexpr data_t SQRT2_1 = 0.7071067811865475;
-constexpr data_t ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
-
-static inline data_t e(int n) {
-  // if (n == 0)
-  //   return SQRT2_1;
-  return 1.0f;
-}
-
-
-static inline data_t dct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0f;
-  for (size_t j = 0; j < buf_size; j++) {
-    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
-    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
-  }
-  return sum;
-}
-
-static inline data_t idct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0;
-  for (size_t j = 0; j < buf_size; j++) {
-    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
-  }
-  return sum;
-}
-
-void dct_naive(data_t *buf, size_t buf_size, data_t *out) {
-  
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
-  }
-}
-
-void idct_naive(data_t *buf, size_t buf_size, data_t *out) {
-  
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
-  }
-}
-
-double err(data_t* buf1, data_t* buf2, size_t buf_size) {
-  double sum = 0.0;
-  for(int i = 0; i < buf_size; i++)
-    sum += abs(buf1[i] - buf2[i]);
-  return sum;
-}
-
-py::array fct(py::array_t<double, py::array::c_style | py::array::forcecast>  in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double*>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  std::cout << "fct: Get array with " << buf_info.size << std::endl;
-  std::cout << "buf: " << ptr[0] << " " << ptr[1] << std::endl;
-  
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double*>(out_arr.request(true).ptr);
-  dct_naive(ptr, buf_size, out_buf);
-  
-  return out_arr;
-}
-
-py::array ifct(py::array_t<double, py::array::c_style | py::array::forcecast>  in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double*>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  std::cout << "ifct: Get array with " << buf_info.size << std::endl;
-  std::cout << "buf: " << ptr[0] << " " << ptr[1] << std::endl;
-  
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double*>(out_arr.request(true).ptr);
-  idct_naive(ptr, buf_size, out_buf);
-  
-  return out_arr;
-
-}
-
-PYBIND11_MODULE(fdct, m) {
-  m.def("fct", &fct, "Compute fast discrete cosine transformation.");
-  m.def("ifct", &ifct, "Compute inverse fast discrete cosine transformation.");
-}
diff --git a/src/fdct.hpp b/src/fdct.hpp
new file mode 100644
index 0000000..e83eaa5
--- /dev/null
+++ b/src/fdct.hpp
@@ -0,0 +1,229 @@
+#include <cassert>
+#include <fftw3.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <cmath>
+#include <complex>
+#include <cstddef>
+
+#include "FftComplex.hpp"
+#include "complex.hpp"
+#include "fft.hpp"
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+
+using data_t = double;
+
+constexpr data_t SQRT2_1 = 0.7071067811865475;
+constexpr data_t ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
+
+static inline data_t e(int n) { return 1.0f; }
+
+ComplexArray reorderInput(const ComplexArray &x) {
+  size_t N = x.size();
+  ComplexArray u(N);
+  const double *x_real = x.get_real_ptr();
+  const double *x_imag = x.get_imag_ptr();
+  double *u_real = u.get_real_ptr();
+  double *u_imag = u.get_imag_ptr();
+
+  for (size_t n = 0; n < N / 2; ++n) {
+    u_real[n] = x_real[2 * n];
+    u_imag[n] = x_imag[2 * n];
+    u_real[N - n - 1] = x_real[2 * n + 1];
+    u_imag[N - n - 1] = x_imag[2 * n + 1];
+  }
+  return u;
+}
+
+std::vector<double> reorderOutput(const std::vector<double> &x) {
+  size_t N = x.size();
+  std::vector<double> reordered(N);
+
+  for (size_t n = 0; n < N / 2; ++n) {
+    reordered[2 * n] = x[n];
+    reordered[2 * n + 1] = x[N - n - 1];
+  }
+  return reordered;
+}
+
+// Function to compute the Fast Cosine Transform
+std::vector<double> fct_inner(ComplexArray &x) {
+  size_t N = x.size();
+  std::vector<double> dct(N);
+  auto u = reorderInput(x);
+  ComplexArray fft_in(N);
+  std::vector<std::complex<double> > vc;
+  // Fill FFT input array
+  for (size_t n = 0; n < N; ++n) {
+    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
+    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
+    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
+  }
+  
+  // Perform FFT
+  fft::transform(fft_in, false);
+  Fft::transform(vc, false);
+
+  for (int n = 0; n < N; ++n) {
+    std::cout << fft_in.get_real_ptr()[n] << " " << vc[n].real() << std::endl;
+  }
+  
+  // Compute the DCT using the FFT output
+  for (size_t k = 0; k < N; ++k) {
+    double real = fft_in[k].real();
+    double imag = fft_in[k].imag();
+    dct[k] =
+        (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) * 2;
+  }
+  return dct;
+}
+
+// Wrapper function to accept and return NumPy arrays
+py::array
+fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  // Request a buffer from the input array
+  py::buffer_info buf = in_arr.request();
+  if (buf.ndim != 1) {
+    throw std::runtime_error("Input array must be 1-dimensional.");
+  }
+
+  // Convert input NumPy array to std::vector
+  size_t N = buf.shape[0];
+  ComplexArray input_data(N);
+  std::copy(static_cast<double *>(buf.ptr), static_cast<double *>(buf.ptr) + N,
+            input_data.get_real_ptr());
+  std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
+  // Perform FCT
+  std::vector<double> result = fct_inner(input_data);
+
+  // Convert result back to NumPy array
+  return py::array_t<double>(result.size(), result.data());
+}
+
+std::vector<double> idct_inner(const std::vector<double> &c) {
+  size_t N = c.size();
+  std::vector<double> x(N);
+  auto u = reorderOutput(c);
+
+  // Allocate FFTW input/output arrays
+  fftw_complex *fft_in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+  fftw_complex *fft_out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+
+  // Initialize FFTW plan
+  fftw_plan plan =
+      fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+  // Fill FFTW input array
+  for (size_t n = 0; n < N; ++n) {
+    fft_in[n][0] = u[n]; // Real part
+    fft_in[n][1] = 0.0;  // Imaginary part
+  }
+
+  // Execute IFFT
+  fftw_execute(plan);
+
+  // Compute the IDCT using the IFFT output
+  for (size_t k = 0; k < N; ++k) {
+    double real = fft_out[k][0];
+    double imag = fft_out[k][1];
+    x[k] =
+        (real * cos(M_PI * k / (2.0 * N)) + imag * sin(M_PI * k / (2.0 * N))) /
+        N;
+  }
+
+  // Free FFTW resources
+  fftw_destroy_plan(plan);
+  fftw_free(fft_in);
+  fftw_free(fft_out);
+
+  return x;
+}
+
+// Wrapper function to accept and return NumPy arrays
+py::array
+ifdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  // Request a buffer from the input array
+  py::buffer_info buf = in_arr.request();
+  if (buf.ndim != 1) {
+    throw std::runtime_error("Input array must be 1-dimensional.");
+  }
+
+  // Convert input NumPy array to std::vector
+  size_t N = buf.shape[0];
+  std::vector<double> input_data(static_cast<double *>(buf.ptr),
+                                 static_cast<double *>(buf.ptr) + N);
+
+  // Perform IDCT
+  std::vector<double> result = idct_inner(input_data);
+
+  // Convert result back to NumPy array
+  return py::array_t<double>(result.size(), result.data());
+}
+
+static inline data_t dct_naive_single(data_t *buf, size_t buf_size, size_t i) {
+  data_t sum = 0.0f;
+  for (size_t j = 0; j < buf_size; j++) {
+    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
+    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
+  }
+  return sum;
+}
+
+static inline data_t idct_naive_single(data_t *buf, size_t buf_size, size_t i) {
+  data_t sum = 0.0;
+  for (size_t j = 0; j < buf_size; j++) {
+    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
+  }
+  return sum;
+}
+
+void dct_naive_inner(data_t *buf, size_t buf_size, data_t *out) {
+  for (int i = 0; i < buf_size; i++) {
+    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
+  }
+}
+
+void idct_naive_inner(data_t *buf, size_t buf_size, data_t *out) {
+  for (int i = 0; i < buf_size; i++) {
+    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
+  }
+}
+
+double err(data_t *buf1, data_t *buf2, size_t buf_size) {
+  double sum = 0.0;
+  for (int i = 0; i < buf_size; i++)
+    sum += abs(buf1[i] - buf2[i]);
+  return sum;
+}
+
+py::array dct_naive(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  dct_naive_inner(ptr, buf_size, out_buf);
+
+  return out_arr;
+}
+
+py::array idct_naive(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  idct_naive_inner(ptr, buf_size, out_buf);
+
+  return out_arr;
+}
diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp
new file mode 100644
index 0000000..9a0135a
--- /dev/null
+++ b/src/fdct_module.cpp
@@ -0,0 +1,15 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+#include "fdct.hpp"
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+#include <cmath>
+
+PYBIND11_MODULE(fdct, m) {
+  m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation.");
+  m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation.");
+  m.def("fdct", &fdct, "");
+  m.def("ifdct", &ifdct, "");
+}
diff --git a/src/fft.cpp b/src/fft.cpp
new file mode 100644
index 0000000..40f0948
--- /dev/null
+++ b/src/fft.cpp
@@ -0,0 +1,143 @@
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "complex.hpp"
+#include "fft.hpp"
+
+static size_t reverseBits(size_t val, int width);
+
+void fft::transformRadix2(ComplexArray &x, bool inverse) {
+  int levels = 0;
+  size_t n = x.size();
+  double *real = x.get_imag_ptr();
+  double *imag = x.get_real_ptr();
+
+  for (size_t temp = n; temp > 1U; temp >>= 1)
+    levels++;
+  if (static_cast<size_t>(1U) << levels != n)
+    throw std::domain_error("Length is not a power of 2");
+
+  double *real_expTable = new double[n / 2];
+  double *imag_expTable = new double[n / 2];
+
+  for (size_t i = 0; i < n / 2; i++) {
+    double angle = (inverse ? 2 : -2) * M_PI * i / n;
+    real_expTable[i] = cos(angle);
+    imag_expTable[i] = sin(angle);
+  }
+
+  for (size_t i = 0; i < n; i++) {
+    size_t j = reverseBits(i, levels);
+    if (j > i) {
+      std::swap(real[i], real[j]);
+      std::swap(imag[i], imag[j]);
+    }
+  }
+
+  for (size_t size = 2; size <= n; size *= 2) {
+    size_t halfsize = size / 2;
+    size_t tablestep = n / size;
+    for (size_t i = 0; i < n; i += size) {
+      for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+        double temp_real = real[j + halfsize] * real_expTable[k] -
+                           imag[j + halfsize] * imag_expTable[k];
+        double temp_imag = real[j + halfsize] * imag_expTable[k] +
+                           imag[j + halfsize] * real_expTable[k];
+        real[j + halfsize] = real[j] - temp_real;
+        imag[j + halfsize] = imag[j] - temp_imag;
+        real[j] += temp_real;
+        imag[j] += temp_imag;
+      }
+    }
+    if (size == n)
+      break;
+  }
+}
+
+void fft::transformBluestein(ComplexArray &x, bool inverse) {
+  size_t n = x.size();
+  double *real = x.get_imag_ptr();
+  double *imag = x.get_real_ptr();
+  size_t m = 1;
+  while (m / 2 <= n) {
+    if (m > SIZE_MAX / 2)
+      throw std::length_error("Vector too large");
+    m *= 2;
+  }
+
+  // Trigonometric table
+  double *real_expTable = new double[n];
+  double *imag_expTable = new double[n];
+  for (size_t i = 0; i < n; i++) {
+    uintmax_t temp = static_cast<uintmax_t>(i) * i;
+    temp %= static_cast<uintmax_t>(n) * 2;
+    double angle = (inverse ? M_PI : -M_PI) * temp / n;
+    real_expTable[i] = cos(angle);
+    imag_expTable[i] = sin(angle);
+  }
+  
+  double *real_avec = new double[m];
+  double *imag_avec = new double[m];
+  for (size_t i = 0; i < n; i++) {
+    real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i];
+    imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i];
+  }
+  double *real_bvec = new double[m];
+  double *imag_bvec = new double[m];
+  real_bvec[0] = real_expTable[0];
+  imag_bvec[0] = imag_expTable[0];
+  for (size_t i = 1; i < n; ++i) {
+    real_bvec[i] = real_bvec[m - i] = real_expTable[i];
+    imag_bvec[i] = imag_bvec[m - i] = imag_expTable[i];
+  }
+
+  ComplexArray c = convolove(, y);
+}
+
+ComplexArray fft::convolove(ComplexArray x, ComplexArray y) {
+  size_t n = x.size();
+  if (n != y.size())
+    throw std::domain_error("Mismatched lengths");
+  transform(x, false);
+  transform(y, false);
+
+  double *xr = x.get_real_ptr();
+  double *yr = y.get_real_ptr();
+  double *xi = x.get_imag_ptr();
+  double *yi = y.get_imag_ptr();
+  
+  for (size_t i = 0; i < n; ++i) {
+    double xrn = xr[i] * yr[i] - xi[i] * yi[i];
+    double xin = xr[i] * yi[i] + xi[i] * yr[i];
+    xr[i] = xrn;
+    xi[i] = xin;
+  }
+  transform(x, true);
+  for (size_t i = 0; i < n; ++i) {
+    xr[i] /= static_cast<double>(n);
+    xi[i] /= static_cast<double>(n);
+  }
+  return x;
+}
+
+void fft::transform(ComplexArray &x, bool inverse) {
+  size_t n = x.size();
+  if (n == 0)
+    return;
+  else if ((n & (n - 1)) == 0) // Is power of 2
+    transformRadix2(x, inverse);
+  else // More complicated algorithm for arbitrary sizes
+    transformBluestein(x, inverse);
+}
+
+static size_t reverseBits(size_t val, int width) {
+  size_t result = 0;
+  for (int i = 0; i < width; i++, val >>= 1)
+    result = (result << 1) | (val & 1U);
+  return result;
+}
diff --git a/src/fft.hpp b/src/fft.hpp
new file mode 100644
index 0000000..8813af0
--- /dev/null
+++ b/src/fft.hpp
@@ -0,0 +1,16 @@
+#ifndef FFT_H
+#define FFT_H
+
+#include "complex.hpp"
+
+namespace fft {
+    void transformRadix2(ComplexArray &x, bool inverse);
+
+    void transformBluestein(ComplexArray &x, bool inverse);
+
+    void transform(ComplexArray &x, bool inverse);
+
+    ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
+}
+
+#endif  // FFT_H
diff --git a/test_dct.py b/test_dct.py
index a8707cc..3ed77ee 100644
--- a/test_dct.py
+++ b/test_dct.py
@@ -1,25 +1,39 @@
 import numpy as np
 import fdct
-import scipy
+import scipy.fft as fft
 
 def test_dct():
     inp = np.random.rand(1024)
-    gt = scipy.fft.dct(inp)
-    ans = fdct.fct(inp)
+    gt = fft.dct(inp)
+    ans = fdct.dct_naive(inp)
     s = sum((gt - ans) ** 2) / 1024
     assert s < 0.1
 
 def test_idct():
     inp = np.random.rand(1024)
-    gt = scipy.fft.idct(inp)
-    ans = fdct.ifct(inp)
-    print(gt)
-    print(ans)
+    gt = fft.idct(inp)
+    ans = fdct.idct_naive(inp)
+    
     s = sum((gt - ans) ** 2) / 1024
     assert s < 0.1
     
+def test_fdct():
+    inp = np.random.rand(10)
+    gt = fft.dct(inp)
+    ans = fdct.fdct(inp)
+    s = sum((gt - ans) ** 2) / 10
+    print(gt)
+    print(ans)
+    assert s < 0.1
 
+def test_ifdct():
+    inp = np.random.rand(10)
+    gt = fft.idct(inp)
+    ans = fdct.ifdct(inp)
+    s = sum((gt - ans) ** 2) / 10
+    assert s < 0.1
     
 if __name__ == "__main__":
     test_dct()
     test_idct()
+    test_fdct()

From 8fc4db4d2b3a0755ca09428ce4c26f966345a893 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Wed, 11 Dec 2024 15:13:15 +0000
Subject: [PATCH 02/16] chore: rearrangement C++ files

---
 src/FftComplex.cpp     | 149 ----------------------------------
 src/FftComplex.hpp     |  61 --------------
 src/FftComplexTest.cpp | 177 -----------------------------------------
 src/complex.hpp        |  92 ---------------------
 src/complex_array.hpp  | 148 ++++++++++++++++++++++++++++++++++
 src/dct_naive.hpp      |  72 +++++++++++++++++
 src/fdct.hpp           | 141 ++++----------------------------
 src/fdct_inner.cpp     |  83 +++++++++++++++++++
 src/fdct_inner.hpp     |  17 ++++
 src/fdct_module.cpp    |   5 +-
 src/fft.cpp            |  49 +++++++-----
 src/fft.hpp            |  17 ++--
 12 files changed, 376 insertions(+), 635 deletions(-)
 delete mode 100644 src/FftComplex.cpp
 delete mode 100644 src/FftComplex.hpp
 delete mode 100644 src/FftComplexTest.cpp
 delete mode 100644 src/complex.hpp
 create mode 100644 src/complex_array.hpp
 create mode 100644 src/dct_naive.hpp
 create mode 100644 src/fdct_inner.cpp
 create mode 100644 src/fdct_inner.hpp

diff --git a/src/FftComplex.cpp b/src/FftComplex.cpp
deleted file mode 100644
index 4f842a0..0000000
--- a/src/FftComplex.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <utility>
-#include "FftComplex.hpp"
-
-using std::complex;
-using std::size_t;
-using std::uintmax_t;
-using std::vector;
-
-
-// Private function prototypes
-static size_t reverseBits(size_t val, int width);
-
-
-void Fft::transform(vector<complex<double> > &vec, bool inverse) {
-	size_t n = vec.size();
-	if (n == 0)
-		return;
-	else if ((n & (n - 1)) == 0)  // Is power of 2
-		transformRadix2(vec, inverse);
-	else  // More complicated algorithm for arbitrary sizes
-		transformBluestein(vec, inverse);
-}
-
-
-void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
-	// Length variables
-	size_t n = vec.size();
-	int levels = 0;  // Compute levels = floor(log2(n))
-	for (size_t temp = n; temp > 1U; temp >>= 1)
-		levels++;
-	if (static_cast<size_t>(1U) << levels != n)
-		throw std::domain_error("Length is not a power of 2");
-	
-	// Trigonometric table
-	vector<complex<double> > expTable(n / 2);
-	for (size_t i = 0; i < n / 2; i++)
-		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
-	
-	// Bit-reversed addressing permutation
-	for (size_t i = 0; i < n; i++) {
-		size_t j = reverseBits(i, levels);
-		if (j > i)
-			std::swap(vec[i], vec[j]);
-	}
-	
-	// Cooley-Tukey decimation-in-time radix-2 FFT
-	for (size_t size = 2; size <= n; size *= 2) {
-		size_t halfsize = size / 2;
-		size_t tablestep = n / size;
-		for (size_t i = 0; i < n; i += size) {
-			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
-				complex<double> temp = vec[j + halfsize] * expTable[k];
-				vec[j + halfsize] = vec[j] - temp;
-				vec[j] += temp;
-			}
-		}
-		if (size == n)  // Prevent overflow in 'size *= 2'
-			break;
-	}
-}
-
-
-void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
-	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
-	size_t n = vec.size();
-	size_t m = 1;
-	while (m / 2 <= n) {
-		if (m > SIZE_MAX / 2)
-			throw std::length_error("Vector too large");
-		m *= 2;
-	}
-	
-	// Trigonometric table
-	vector<complex<double> > expTable(n);
-	for (size_t i = 0; i < n; i++) {
-		uintmax_t temp = static_cast<uintmax_t>(i) * i;
-		temp %= static_cast<uintmax_t>(n) * 2;
-		double angle = (inverse ? M_PI : -M_PI) * temp / n;
-		expTable[i] = std::polar(1.0, angle);
-	}
-	
-	// Temporary vectors and preprocessing
-	vector<complex<double> > avec(m);
-	for (size_t i = 0; i < n; i++)
-		avec[i] = vec[i] * expTable[i];
-	vector<complex<double> > bvec(m);
-	bvec[0] = expTable[0];
-	for (size_t i = 1; i < n; i++)
-		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
-	
-	// Convolution
-	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
-	
-	// Postprocessing
-	for (size_t i = 0; i < n; i++)
-		vec[i] = cvec[i] * expTable[i];
-}
-
-
-vector<complex<double> > Fft::convolve(
-		vector<complex<double> > xvec,
-		vector<complex<double> > yvec) {
-	
-	size_t n = xvec.size();
-	if (n != yvec.size())
-		throw std::domain_error("Mismatched lengths");
-	transform(xvec, false);
-	transform(yvec, false);
-	for (size_t i = 0; i < n; i++)
-		xvec[i] *= yvec[i];
-	transform(xvec, true);
-	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
-		xvec[i] /= static_cast<double>(n);
-	return xvec;
-}
-
-
-static size_t reverseBits(size_t val, int width) {
-	size_t result = 0;
-	for (int i = 0; i < width; i++, val >>= 1)
-		result = (result << 1) | (val & 1U);
-	return result;
-}
diff --git a/src/FftComplex.hpp b/src/FftComplex.hpp
deleted file mode 100644
index 24eeabb..0000000
--- a/src/FftComplex.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#pragma once
-
-#include <complex>
-#include <vector>
-using Complex = std::complex<double>;
-
-namespace Fft {
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
-	 */
-	void transform(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
-	 */
-	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
-	 * Uses Bluestein's chirp z-transform algorithm.
-	 */
-	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
-	 */
-	std::vector<std::complex<double> > convolve(
-		std::vector<std::complex<double> > xvec,
-		std::vector<std::complex<double> > yvec);
-	
-}
diff --git a/src/FftComplexTest.cpp b/src/FftComplexTest.cpp
deleted file mode 100644
index dd960d1..0000000
--- a/src/FftComplexTest.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* 
- * FFT and convolution test (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdlib>
-#include <iomanip>
-#include <iostream>
-#include <random>
-#include <utility>
-#include <vector>
-#include "FftComplex.hpp"
-
-using std::complex;
-using std::cout;
-using std::endl;
-using std::vector;
-
-
-// Private function prototypes
-static void testFft(int n);
-static void testConvolution(int n);
-static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
-static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
-static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
-static vector<complex<double> > randomComplexes(int n);
-
-// Mutable global variable
-static double maxLogError = -INFINITY;
-
-// Random number generation
-std::default_random_engine randGen((std::random_device())());
-
-
-/*---- Main and test functions ----*/
-
-int main() {
-	// Test power-of-2 size FFTs
-	for (int i = 0; i <= 12; i++)
-		testFft(1 << i);
-	
-	// Test small size FFTs
-	for (int i = 0; i < 30; i++)
-		testFft(i);
-	
-	// Test diverse size FFTs
-	for (int i = 0, prev = 0; i <= 100; i++) {
-		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
-		if (n > prev) {
-			testFft(n);
-			prev = n;
-		}
-	}
-	
-	// Test power-of-2 size convolutions
-	for (int i = 0; i <= 12; i++)
-		testConvolution(1 << i);
-	
-	// Test diverse size convolutions
-	for (int i = 0, prev = 0; i <= 100; i++) {
-		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
-		if (n > prev) {
-			testConvolution(n);
-			prev = n;
-		}
-	}
-	
-	cout << endl;
-	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
-	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
-	return EXIT_SUCCESS;
-}
-
-
-static void testFft(int n) {
-	const vector<complex<double> > input = randomComplexes(n);
-	const vector<complex<double> > expect = naiveDft(input, false);
-	vector<complex<double> > actual = input;
-	Fft::transform(actual, false);
-	double err = log10RmsErr(expect, actual);
-	
-	for (auto it = actual.begin(); it != actual.end(); ++it)
-		*it /= n;
-	Fft::transform(actual, true);
-	err = std::max(log10RmsErr(input, actual), err);
-	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
-	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
-	     << err << endl;
-}
-
-
-static void testConvolution(int n) {
-	const vector<complex<double> > input0 = randomComplexes(n);
-	const vector<complex<double> > input1 = randomComplexes(n);
-	const vector<complex<double> > expect = naiveConvolve(input0, input1);
-	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
-	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
-	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
-	     << log10RmsErr(expect, actual) << endl;
-}
-
-
-/*---- Naive reference computation functions ----*/
-
-static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
-	int n = static_cast<int>(input.size());
-	vector<complex<double> > output;
-	double coef = (inverse ? 2 : -2) * M_PI / n;
-	for (int k = 0; k < n; k++) {  // For each output element
-		complex<double> sum(0);
-		for (int t = 0; t < n; t++) {  // For each input element
-			double angle = coef * (static_cast<long long>(t) * k % n);
-			sum += input[t] * std::polar(1.0, angle);
-		}
-		output.push_back(sum);
-	}
-	return output;
-}
-
-
-static vector<complex<double> > naiveConvolve(
-		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
-	int n = static_cast<int>(xvec.size());
-	vector<complex<double> > result(n);  // All zeros
-	for (int i = 0; i < n; i++) {
-		for (int j = 0; j < n; j++) {
-			int k = (i + j) % n;
-			result[k] += xvec[i] * yvec[j];
-		}
-	}
-	return result;
-}
-
-
-/*---- Utility functions ----*/
-
-static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
-	int n = static_cast<int>(xvec.size());
-	double err = std::pow(10, -99 * 2);
-	for (int i = 0; i < n; i++)
-		err += std::norm(xvec.at(i) - yvec.at(i));
-	err /= n > 0 ? n : 1;
-	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
-	err = std::log10(err);
-	maxLogError = std::max(err, maxLogError);
-	return err;
-}
-
-
-static vector<complex<double> > randomComplexes(int n) {
-	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
-	vector<complex<double> > result;
-	for (int i = 0; i < n; i++)
-		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
-	return result;
-}
diff --git a/src/complex.hpp b/src/complex.hpp
deleted file mode 100644
index 23a2b65..0000000
--- a/src/complex.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef COMPLEX_ARRAY_HPP
-#define COMPLEX_ARRAY_HPP
-#include <algorithm>
-#include <complex>
-#include <stdexcept>
-
-
-using Complex = std::complex<double>;
-
-class ComplexArray {
-   private:
-    double *real;
-    double *imag;
-    size_t array_size;
-
-   public:
-    ComplexArray(size_t n) : array_size(n) {
-        if (n == 0) {
-            real = nullptr;
-            imag = nullptr;
-        } else {
-            real = new double[n];
-            imag = new double[n];
-        }
-    }
-
-    ~ComplexArray() {
-        if (real != nullptr) {
-            delete[] real;
-        }
-        if (imag != nullptr) {
-            delete[] imag;
-        }
-    }
-
-    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
-        real = new double[array_size];
-        imag = new double[array_size];
-        std::copy(other.real, other.real + array_size, real);
-        std::copy(other.imag, other.imag + array_size, imag);
-    }
-
-    ComplexArray &operator=(const ComplexArray &other) {
-        if (this == &other) {
-            return *this;
-        }
-        if (real != nullptr) {
-            delete[] real;
-        }
-        if (imag != nullptr) {
-            delete[] imag;
-        }
-
-        array_size = other.array_size;
-        real = new double[array_size];
-        imag = new double[array_size];
-
-        for (size_t i = 0; i < array_size; i++) {
-            real[i] = other.real[i];
-            imag[i] = other.imag[i];
-        }
-
-        return *this;
-    }
-
-    Complex operator[](size_t index) const {
-        return Complex(real[index], imag[index]);
-    }
-
-    size_t size() const {
-        return array_size;
-    }
-
-    double* get_real_ptr() {
-        return real;
-    }
-
-    const double* get_real_ptr() const {
-        return real;
-    }
-
-    double* get_imag_ptr() {
-        return imag;
-    }
-
-    const double* get_imag_ptr() const {
-        return imag;
-    }
-
-};
-
-#endif
diff --git a/src/complex_array.hpp b/src/complex_array.hpp
new file mode 100644
index 0000000..8d0762c
--- /dev/null
+++ b/src/complex_array.hpp
@@ -0,0 +1,148 @@
+#ifndef COMPLEX_ARRAY_HPP
+#define COMPLEX_ARRAY_HPP
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+using Complex = std::complex<double>;
+
+class ComplexProxy {
+
+  double *real_ptr;
+  double *imag_ptr;
+
+public:
+
+  double &real() { return *real_ptr; }
+  double &imag() { return *imag_ptr; }
+  
+  ComplexProxy(double *real_ptr, double *imag_ptr)
+      : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
+
+  ComplexProxy(ComplexProxy &&rhs) noexcept
+      : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
+    rhs.real_ptr = nullptr;
+    rhs.imag_ptr = nullptr;
+  }
+  
+  operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
+
+  
+  ComplexProxy &operator=(const Complex &c) {
+    *real_ptr = c.real();
+    *imag_ptr = c.imag();
+    return *this;
+  }
+};
+
+class ComplexArray {
+private:
+  double *real;
+  double *imag;
+  size_t array_size;
+
+public:
+  ComplexArray(size_t n) : array_size(n) {
+    if (n == 0) {
+      real = nullptr;
+      imag = nullptr;
+    } else {
+      real = new double[n];
+      std::fill(real, real + n, 0.0);
+      imag = new double[n];
+      std::fill(imag, imag + n, 0.0);
+    }
+  }
+
+  ~ComplexArray() {
+    if (real != nullptr) {
+      delete[] real;
+    }
+    if (imag != nullptr) {
+      delete[] imag;
+    }
+  }
+
+  ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+    real = new double[array_size];
+    imag = new double[array_size];
+    std::copy(other.real, other.real + array_size, real);
+    std::copy(other.imag, other.imag + array_size, imag);
+  }
+
+  ComplexArray(ComplexArray &&rhs) noexcept
+      : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
+    rhs.real = nullptr;
+    rhs.imag = nullptr;
+    rhs.array_size = 0;
+  }
+
+  ComplexArray &operator=(const ComplexArray &other) {
+    if (this == &other) {
+      return *this;
+    }
+    if (real != nullptr) {
+      delete[] real;
+    }
+    if (imag != nullptr) {
+      delete[] imag;
+    }
+
+    array_size = other.array_size;
+    real = new double[array_size];
+    imag = new double[array_size];
+
+    for (size_t i = 0; i < array_size; i++) {
+      real[i] = other.real[i];
+      imag[i] = other.imag[i];
+    }
+
+    return *this;
+  }
+
+  ComplexArray operator*(const ComplexArray &other) const {
+    if (array_size != other.array_size) {
+      throw std::invalid_argument("Array sizes do not match");
+    }
+
+    ComplexArray result(array_size);
+    for (size_t i = 0; i < array_size; i++) {
+      result.real[i] = real[i] * other.real[i] - imag[i] * other.imag[i];
+      result.imag[i] = real[i] * other.imag[i] + imag[i] * other.real[i];
+    }
+    return result;
+  }
+
+  ComplexArray operator+(const ComplexArray &other) const {
+    if (array_size != other.array_size) {
+      throw std::invalid_argument("Array sizes do not match");
+    }
+
+    ComplexArray result(array_size);
+    for (size_t i = 0; i < array_size; i++) {
+      result.real[i] = real[i] + other.real[i];
+      result.imag[i] = imag[i] + other.imag[i];
+    }
+    return result;
+  }
+
+  Complex operator[](size_t index) const {
+    return Complex(real[index], imag[index]);
+  }
+
+  ComplexProxy operator[](size_t index) {
+    return ComplexProxy(real + index, imag + index);
+  }
+
+  size_t size() const { return array_size; }
+
+  double *get_real_ptr() { return real; }
+
+  const double *get_real_ptr() const { return real; }
+
+  double *get_imag_ptr() { return imag; }
+
+  const double *get_imag_ptr() const { return imag; }
+};
+
+#endif
diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp
new file mode 100644
index 0000000..ffbae89
--- /dev/null
+++ b/src/dct_naive.hpp
@@ -0,0 +1,72 @@
+#pragma once
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <cmath>
+
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+
+constexpr double SQRT2_1 = 0.7071067811865475;
+constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
+
+static inline double e(int n) { return 1.0f; }
+
+static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
+  double sum = 0.0f;
+  for (size_t j = 0; j < buf_size; j++) {
+    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
+    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
+  }
+  return sum;
+}
+
+static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
+  double sum = 0.0;
+  for (size_t j = 0; j < buf_size; j++) {
+    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
+  }
+  return sum;
+}
+
+void dct_naive_inner(double *buf, size_t buf_size, double *out) {
+  for (int i = 0; i < buf_size; i++) {
+    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
+  }
+}
+
+void idct_naive_inner(double *buf, size_t buf_size, double *out) {
+  for (int i = 0; i < buf_size; i++) {
+    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
+  }
+}
+
+py::array dct_naive(
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  dct_naive_inner(ptr, buf_size, out_buf);
+
+  return out_arr;
+}
+
+py::array idct_naive(
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  idct_naive_inner(ptr, buf_size, out_buf);
+
+  return out_arr;
+}
diff --git a/src/fdct.hpp b/src/fdct.hpp
index e83eaa5..afe7665 100644
--- a/src/fdct.hpp
+++ b/src/fdct.hpp
@@ -1,43 +1,25 @@
+#pragma once
+
 #include <cassert>
 #include <fftw3.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include <iostream>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 
 #include "FftComplex.hpp"
-#include "complex.hpp"
+#include "complex_array.hpp"
 #include "fft.hpp"
+#include "fdct_inner.hpp"
+
 
 namespace py = pybind11;
 #define _USE_MATH_DEFINES
 
-using data_t = double;
-
-constexpr data_t SQRT2_1 = 0.7071067811865475;
-constexpr data_t ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
-
-static inline data_t e(int n) { return 1.0f; }
-
-ComplexArray reorderInput(const ComplexArray &x) {
-  size_t N = x.size();
-  ComplexArray u(N);
-  const double *x_real = x.get_real_ptr();
-  const double *x_imag = x.get_imag_ptr();
-  double *u_real = u.get_real_ptr();
-  double *u_imag = u.get_imag_ptr();
-
-  for (size_t n = 0; n < N / 2; ++n) {
-    u_real[n] = x_real[2 * n];
-    u_imag[n] = x_imag[2 * n];
-    u_real[N - n - 1] = x_real[2 * n + 1];
-    u_imag[N - n - 1] = x_imag[2 * n + 1];
-  }
-  return u;
-}
 
 std::vector<double> reorderOutput(const std::vector<double> &x) {
   size_t N = x.size();
@@ -50,38 +32,6 @@ std::vector<double> reorderOutput(const std::vector<double> &x) {
   return reordered;
 }
 
-// Function to compute the Fast Cosine Transform
-std::vector<double> fct_inner(ComplexArray &x) {
-  size_t N = x.size();
-  std::vector<double> dct(N);
-  auto u = reorderInput(x);
-  ComplexArray fft_in(N);
-  std::vector<std::complex<double> > vc;
-  // Fill FFT input array
-  for (size_t n = 0; n < N; ++n) {
-    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
-    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
-    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
-  }
-  
-  // Perform FFT
-  fft::transform(fft_in, false);
-  Fft::transform(vc, false);
-
-  for (int n = 0; n < N; ++n) {
-    std::cout << fft_in.get_real_ptr()[n] << " " << vc[n].real() << std::endl;
-  }
-  
-  // Compute the DCT using the FFT output
-  for (size_t k = 0; k < N; ++k) {
-    double real = fft_in[k].real();
-    double imag = fft_in[k].imag();
-    dct[k] =
-        (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) * 2;
-  }
-  return dct;
-}
-
 // Wrapper function to accept and return NumPy arrays
 py::array
 fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
@@ -104,6 +54,8 @@ fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
   return py::array_t<double>(result.size(), result.data());
 }
 
+
+
 std::vector<double> idct_inner(const std::vector<double> &c) {
   size_t N = c.size();
   std::vector<double> x(N);
@@ -154,76 +106,15 @@ ifdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
 
   // Convert input NumPy array to std::vector
   size_t N = buf.shape[0];
-  std::vector<double> input_data(static_cast<double *>(buf.ptr),
-                                 static_cast<double *>(buf.ptr) + N);
-
+  // std::vector<double> input_data(static_cast<double *>(buf.ptr),
+  //                               static_cast<double *>(buf.ptr) + N);
+  ComplexArray input_data(N);
+  std::copy(static_cast<double *>(buf.ptr), static_cast<double *>(buf.ptr) + N,
+	    input_data.get_real_ptr());
+  
   // Perform IDCT
-  std::vector<double> result = idct_inner(input_data);
-
+  std::vector<double> result = ifdct_inner(input_data);
+  std::cout << "Result: " << result[0] << std::endl;
   // Convert result back to NumPy array
   return py::array_t<double>(result.size(), result.data());
 }
-
-static inline data_t dct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0f;
-  for (size_t j = 0; j < buf_size; j++) {
-    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
-    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
-  }
-  return sum;
-}
-
-static inline data_t idct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0;
-  for (size_t j = 0; j < buf_size; j++) {
-    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
-  }
-  return sum;
-}
-
-void dct_naive_inner(data_t *buf, size_t buf_size, data_t *out) {
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
-  }
-}
-
-void idct_naive_inner(data_t *buf, size_t buf_size, data_t *out) {
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
-  }
-}
-
-double err(data_t *buf1, data_t *buf2, size_t buf_size) {
-  double sum = 0.0;
-  for (int i = 0; i < buf_size; i++)
-    sum += abs(buf1[i] - buf2[i]);
-  return sum;
-}
-
-py::array dct_naive(
-    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double *>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  dct_naive_inner(ptr, buf_size, out_buf);
-
-  return out_arr;
-}
-
-py::array idct_naive(
-    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double *>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  idct_naive_inner(ptr, buf_size, out_buf);
-
-  return out_arr;
-}
diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp
new file mode 100644
index 0000000..02d007c
--- /dev/null
+++ b/src/fdct_inner.cpp
@@ -0,0 +1,83 @@
+
+#pragma once
+#include <iostream>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <vector>
+
+
+#include "fdct_inner.hpp"
+#include "complex_array.hpp"
+#include "fft.hpp"
+// Function to compute the Fast Cosine Transform
+
+ComplexArray reorderInput(const ComplexArray &x) {
+  size_t N = x.size();
+  ComplexArray u(N);
+  const double *x_real = x.get_real_ptr();
+  const double *x_imag = x.get_imag_ptr();
+  double *u_real = u.get_real_ptr();
+  double *u_imag = u.get_imag_ptr();
+
+  for (size_t n = 0; n < N / 2; ++n) {
+    u_real[n] = x_real[2 * n];
+    u_imag[n] = x_imag[2 * n];
+    u_real[N - n - 1] = x_real[2 * n + 1];
+    u_imag[N - n - 1] = x_imag[2 * n + 1];
+  }
+  return u;
+}
+
+std::vector<double> fct_inner(ComplexArray &x) {
+  size_t N = x.size();
+  std::vector<double> dct(N);
+  auto u = reorderInput(x);
+  ComplexArray fft_in(N);
+  std::vector<std::complex<double> > vc;
+  // Fill FFT input array
+  for (size_t n = 0; n < N; ++n) {
+    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
+    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
+    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
+  }
+  
+  // Perform FFT
+  fft::transform(fft_in, false);
+  
+  // Compute the DCT using the FFT output
+  for (size_t k = 0; k < N; ++k) {
+    double real = fft_in[k].real();
+    double imag = fft_in[k].imag();
+    dct[k] =
+        (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) * 2;
+  }
+  return dct;
+}
+
+std::vector<double> ifdct_inner(ComplexArray &c) {
+  size_t N = c.size();
+  std::vector<double> x(N);
+  auto u = reorderInput(c);
+  ComplexArray fft_in(N);
+  std::vector<std::complex<double> > vc;
+  // Fill FFT input array
+  for (size_t n = 0; n < N; ++n) {
+    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
+    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
+    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
+  }
+  
+  // Perform FFT
+  fft::transform(fft_in, true);
+  
+  // Compute the IDCT using the FFT output
+  for (size_t k = 0; k < N; ++k) {
+    double real = fft_in[k].real();
+    double imag = fft_in[k].imag();
+    x[k] =
+	(real * cos(M_PI * k / (2.0 * N)) + imag * sin(M_PI * k / (2.0 * N))) /
+	N;
+  }
+  return x;
+}
\ No newline at end of file
diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp
new file mode 100644
index 0000000..76e84f6
--- /dev/null
+++ b/src/fdct_inner.hpp
@@ -0,0 +1,17 @@
+
+#pragma once
+#include <iostream>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <vector>
+
+#include "complex_array.hpp"
+
+// Function to compute the Fast Cosine Transform
+
+ComplexArray reorderInput(const ComplexArray &x);
+
+std::vector<double> fct_inner(ComplexArray &x);
+
+std::vector<double> ifdct_inner(ComplexArray &c);
diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp
index 9a0135a..38c5d1e 100644
--- a/src/fdct_module.cpp
+++ b/src/fdct_module.cpp
@@ -2,6 +2,7 @@
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 #include "fdct.hpp"
+#include "dct_naive.hpp"
 
 namespace py = pybind11;
 #define _USE_MATH_DEFINES
@@ -10,6 +11,6 @@ namespace py = pybind11;
 PYBIND11_MODULE(fdct, m) {
   m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation.");
   m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation.");
-  m.def("fdct", &fdct, "");
-  m.def("ifdct", &ifdct, "");
+  m.def("fdct", &fdct, "Compute fast discrete cosine transformation.");
+  m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation.");
 }
diff --git a/src/fft.cpp b/src/fft.cpp
index 40f0948..93bd45d 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -5,8 +5,9 @@
 #include <iostream>
 #include <stdexcept>
 #include <vector>
+#include <stdint.h>
 
-#include "complex.hpp"
+#include "complex_array.hpp"
 #include "fft.hpp"
 
 static size_t reverseBits(size_t val, int width);
@@ -14,21 +15,19 @@ static size_t reverseBits(size_t val, int width);
 void fft::transformRadix2(ComplexArray &x, bool inverse) {
   int levels = 0;
   size_t n = x.size();
-  double *real = x.get_imag_ptr();
-  double *imag = x.get_real_ptr();
-
+  double *real = x.get_real_ptr();
+  double *imag = x.get_imag_ptr();
+  
   for (size_t temp = n; temp > 1U; temp >>= 1)
     levels++;
   if (static_cast<size_t>(1U) << levels != n)
     throw std::domain_error("Length is not a power of 2");
 
-  double *real_expTable = new double[n / 2];
-  double *imag_expTable = new double[n / 2];
+  ComplexArray expTable(n / 2);
 
   for (size_t i = 0; i < n / 2; i++) {
     double angle = (inverse ? 2 : -2) * M_PI * i / n;
-    real_expTable[i] = cos(angle);
-    imag_expTable[i] = sin(angle);
+    expTable[i] = std::polar(1.0, angle);
   }
 
   for (size_t i = 0; i < n; i++) {
@@ -39,6 +38,9 @@ void fft::transformRadix2(ComplexArray &x, bool inverse) {
     }
   }
 
+  double *real_expTable = expTable.get_real_ptr();
+  double *imag_expTable = expTable.get_imag_ptr();
+
   for (size_t size = 2; size <= n; size *= 2) {
     size_t halfsize = size / 2;
     size_t tablestep = n / size;
@@ -61,8 +63,8 @@ void fft::transformRadix2(ComplexArray &x, bool inverse) {
 
 void fft::transformBluestein(ComplexArray &x, bool inverse) {
   size_t n = x.size();
-  double *real = x.get_imag_ptr();
-  double *imag = x.get_real_ptr();
+  double *real = x.get_real_ptr();
+  double *imag = x.get_imag_ptr();
   size_t m = 1;
   while (m / 2 <= n) {
     if (m > SIZE_MAX / 2)
@@ -71,32 +73,39 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
   }
 
   // Trigonometric table
-  double *real_expTable = new double[n];
-  double *imag_expTable = new double[n];
+  ComplexArray expTable(m);
+  double *real_expTable = expTable.get_real_ptr();
+  double *imag_expTable = expTable.get_imag_ptr();
   for (size_t i = 0; i < n; i++) {
     uintmax_t temp = static_cast<uintmax_t>(i) * i;
     temp %= static_cast<uintmax_t>(n) * 2;
     double angle = (inverse ? M_PI : -M_PI) * temp / n;
-    real_expTable[i] = cos(angle);
-    imag_expTable[i] = sin(angle);
+    expTable[i] = std::polar(1.0, angle);
   }
+  ComplexArray a(m), b(m);
+  double *real_avec = a.get_real_ptr();
+  double *imag_avec = a.get_imag_ptr();
   
-  double *real_avec = new double[m];
-  double *imag_avec = new double[m];
   for (size_t i = 0; i < n; i++) {
     real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i];
     imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i];
   }
-  double *real_bvec = new double[m];
-  double *imag_bvec = new double[m];
+  
+  // a = x * expTable;
+  
+  double *real_bvec = b.get_real_ptr();
+  double *imag_bvec = b.get_imag_ptr();
   real_bvec[0] = real_expTable[0];
   imag_bvec[0] = imag_expTable[0];
+  
   for (size_t i = 1; i < n; ++i) {
     real_bvec[i] = real_bvec[m - i] = real_expTable[i];
-    imag_bvec[i] = imag_bvec[m - i] = imag_expTable[i];
+    imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i];
   }
+  
+  ComplexArray c = convolove(a, b);
 
-  ComplexArray c = convolove(, y);
+  x = c * expTable;
 }
 
 ComplexArray fft::convolove(ComplexArray x, ComplexArray y) {
diff --git a/src/fft.hpp b/src/fft.hpp
index 8813af0..c4620fb 100644
--- a/src/fft.hpp
+++ b/src/fft.hpp
@@ -1,16 +1,15 @@
-#ifndef FFT_H
-#define FFT_H
+#pragma once
 
-#include "complex.hpp"
+#include "complex_array.hpp"
 
 namespace fft {
-    void transformRadix2(ComplexArray &x, bool inverse);
+void transformRadix2(ComplexArray &x, bool inverse);
 
-    void transformBluestein(ComplexArray &x, bool inverse);
+void transformBluestein(ComplexArray &x, bool inverse);
 
-    void transform(ComplexArray &x, bool inverse);
+void transform(ComplexArray &x, bool inverse);
 
-    ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
-}
+// ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
+ComplexArray convolove(ComplexArray x, ComplexArray y);
+} // namespace fft
 
-#endif  // FFT_H

From e29dcb4219a892f9075439957010e10896fed273 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Wed, 11 Dec 2024 15:13:40 +0000
Subject: [PATCH 03/16] chore: new test

---
 test_dct.py | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/test_dct.py b/test_dct.py
index 3ed77ee..98eeb4f 100644
--- a/test_dct.py
+++ b/test_dct.py
@@ -1,39 +1,36 @@
+import pytest
 import numpy as np
 import fdct
 import scipy.fft as fft
 
-def test_dct():
-    inp = np.random.rand(1024)
+# Parameters
+test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097]
+tolerance = 1e-6  # Tolerance for individual element comparison
+
+@pytest.mark.parametrize("size", test_size_seq)
+def test_dct_naive(size):
+    inp = np.random.rand(size)
     gt = fft.dct(inp)
     ans = fdct.dct_naive(inp)
-    s = sum((gt - ans) ** 2) / 1024
-    assert s < 0.1
+    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
 
-def test_idct():
-    inp = np.random.rand(1024)
+@pytest.mark.parametrize("size", test_size_seq)
+def test_idct_naive(size):
+    inp = np.random.rand(size)
     gt = fft.idct(inp)
     ans = fdct.idct_naive(inp)
-    
-    s = sum((gt - ans) ** 2) / 1024
-    assert s < 0.1
-    
-def test_fdct():
-    inp = np.random.rand(10)
+    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
+
+@pytest.mark.parametrize("size", test_size_seq)
+def test_fdct(size):
+    inp = np.random.rand(size)
     gt = fft.dct(inp)
     ans = fdct.fdct(inp)
-    s = sum((gt - ans) ** 2) / 10
-    print(gt)
-    print(ans)
-    assert s < 0.1
+    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
 
-def test_ifdct():
-    inp = np.random.rand(10)
+@pytest.mark.parametrize("size", test_size_seq)
+def test_ifdct(size):
+    inp = np.random.rand(size)
     gt = fft.idct(inp)
     ans = fdct.ifdct(inp)
-    s = sum((gt - ans) ** 2) / 10
-    assert s < 0.1
-    
-if __name__ == "__main__":
-    test_dct()
-    test_idct()
-    test_fdct()
+    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)

From b6cec8444471741b868d06153bb2a060f478c25e Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Wed, 11 Dec 2024 15:17:28 +0000
Subject: [PATCH 04/16] fix: missing file

---
 CMakeLists.txt | 12 +++++++++++-
 src/fdct.hpp   |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 846483e..bfa4c97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,17 @@ include_directories(${PYTHON_INCLUDE_DIRS})
 # Find Pybind11
 find_package(pybind11 REQUIRED)
 
-pybind11_add_module(fdct src/fdct.cpp)
+find_package(PkgConfig REQUIRED)
+pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
+include_directories(PkgConfig::FFTW)
+link_libraries(PkgConfig::FFTW)
+
+pybind11_add_module(fdct 
+    src/fdct_module.cpp 
+    # src/FftComplex.cpp
+    src/fft.cpp
+    src/fdct_inner.cpp
+)
 
 # Link the module with Python
 target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
diff --git a/src/fdct.hpp b/src/fdct.hpp
index afe7665..04ef415 100644
--- a/src/fdct.hpp
+++ b/src/fdct.hpp
@@ -11,7 +11,7 @@
 #include <complex>
 #include <cstddef>
 
-#include "FftComplex.hpp"
+// #include "FftComplex.hpp"
 #include "complex_array.hpp"
 #include "fft.hpp"
 #include "fdct_inner.hpp"

From e2ac5fc0a26115cbf71eda1ab9c1da60f473ebfa Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Wed, 11 Dec 2024 15:21:31 +0000
Subject: [PATCH 05/16] fix: CI dependency

---
 .github/workflows/cmake-single-platform.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake-single-platform.yml b/.github/workflows/cmake-single-platform.yml
index 5012e2a..ca27d04 100644
--- a/.github/workflows/cmake-single-platform.yml
+++ b/.github/workflows/cmake-single-platform.yml
@@ -29,7 +29,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        sudo apt install python3-pybind11
+        sudo apt install python3-pybind11 libfftw3-dev
         python -m pip install --upgrade pip
         python -m pip install pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From 86047847ca8a3fc856207c0716cc197ea0611e10 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Wed, 11 Dec 2024 16:09:48 +0000
Subject: [PATCH 06/16] fix: naive idct

---
 src/dct_naive.hpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp
index ffbae89..7764d41 100644
--- a/src/dct_naive.hpp
+++ b/src/dct_naive.hpp
@@ -12,12 +12,9 @@ namespace py = pybind11;
 constexpr double SQRT2_1 = 0.7071067811865475;
 constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
 
-static inline double e(int n) { return 1.0f; }
-
 static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
   double sum = 0.0f;
   for (size_t j = 0; j < buf_size; j++) {
-    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
     sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
   }
   return sum;
@@ -25,21 +22,21 @@ static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
 
 static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
   double sum = 0.0;
-  for (size_t j = 0; j < buf_size; j++) {
-    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
+  for (size_t j = 1; j < buf_size; j++) {
+    sum += buf[j] * cos(M_PI * j * (2 * i + 1.0) / (2.0 * buf_size));
   }
   return sum;
 }
 
 void dct_naive_inner(double *buf, size_t buf_size, double *out) {
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
+  for (size_t i = 0; i < buf_size; i++) {
+    out[i] = 2.0 * dct_naive_single(buf, buf_size, i);
   }
 }
 
 void idct_naive_inner(double *buf, size_t buf_size, double *out) {
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
+  for (size_t i = 0; i < buf_size; i++) {
+    out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size);
   }
 }
 

From 039c4140747bf5c06ddffdf3325d96a6775a5514 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 08:00:14 +0000
Subject: [PATCH 07/16] chore: new gitignore

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9bf3746..374071e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 compile_commands.json
+build/
+.vscode/
+.cache/
+*.so
+

From 269f5cf5b2a0704c0611a29c11218b0376cb7d2c Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 08:01:48 +0000
Subject: [PATCH 08/16] fix: correct fdct

---
 CMakeLists.txt             |   3 +-
 src/fdct.hpp               |  21 +---
 src/fdct_inner.cpp         | 223 ++++++++++++++++++++++++++-----------
 src/fft.cpp                |   7 +-
 src/ref/FastDctFft.hpp     |  36 ++++++
 src/ref/FastDctFtt.cpp     |  75 +++++++++++++
 src/ref/FftComplex.cpp     | 149 +++++++++++++++++++++++++
 src/ref/FftComplex.hpp     |  61 ++++++++++
 src/ref/FftComplexTest.cpp | 177 +++++++++++++++++++++++++++++
 src/ref/FftRealPair.cpp    | 195 ++++++++++++++++++++++++++++++++
 src/ref/FftRealPair.hpp    |  75 +++++++++++++
 src/ref/README.org         |   5 +
 12 files changed, 940 insertions(+), 87 deletions(-)
 create mode 100644 src/ref/FastDctFft.hpp
 create mode 100644 src/ref/FastDctFtt.cpp
 create mode 100644 src/ref/FftComplex.cpp
 create mode 100644 src/ref/FftComplex.hpp
 create mode 100644 src/ref/FftComplexTest.cpp
 create mode 100644 src/ref/FftRealPair.cpp
 create mode 100644 src/ref/FftRealPair.hpp
 create mode 100644 src/ref/README.org

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bfa4c97..4afb28a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,8 +14,7 @@ include_directories(PkgConfig::FFTW)
 link_libraries(PkgConfig::FFTW)
 
 pybind11_add_module(fdct 
-    src/fdct_module.cpp 
-    # src/FftComplex.cpp
+    src/fdct_module.cpp
     src/fft.cpp
     src/fdct_inner.cpp
 )
diff --git a/src/fdct.hpp b/src/fdct.hpp
index 04ef415..ee08687 100644
--- a/src/fdct.hpp
+++ b/src/fdct.hpp
@@ -20,18 +20,6 @@
 namespace py = pybind11;
 #define _USE_MATH_DEFINES
 
-
-std::vector<double> reorderOutput(const std::vector<double> &x) {
-  size_t N = x.size();
-  std::vector<double> reordered(N);
-
-  for (size_t n = 0; n < N / 2; ++n) {
-    reordered[2 * n] = x[n];
-    reordered[2 * n + 1] = x[N - n - 1];
-  }
-  return reordered;
-}
-
 // Wrapper function to accept and return NumPy arrays
 py::array
 fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
@@ -54,13 +42,11 @@ fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
   return py::array_t<double>(result.size(), result.data());
 }
 
-
-
 std::vector<double> idct_inner(const std::vector<double> &c) {
   size_t N = c.size();
   std::vector<double> x(N);
-  auto u = reorderOutput(c);
-
+  // auto u = reorderOutput(c);
+  ComplexArray u(N);
   // Allocate FFTW input/output arrays
   fftw_complex *fft_in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
   fftw_complex *fft_out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
@@ -71,7 +57,7 @@ std::vector<double> idct_inner(const std::vector<double> &c) {
 
   // Fill FFTW input array
   for (size_t n = 0; n < N; ++n) {
-    fft_in[n][0] = u[n]; // Real part
+    fft_in[n][0] = c[n]; // Real part
     fft_in[n][1] = 0.0;  // Imaginary part
   }
 
@@ -114,7 +100,6 @@ ifdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
   
   // Perform IDCT
   std::vector<double> result = ifdct_inner(input_data);
-  std::cout << "Result: " << result[0] << std::endl;
   // Convert result back to NumPy array
   return py::array_t<double>(result.size(), result.data());
 }
diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp
index 02d007c..bc609dd 100644
--- a/src/fdct_inner.cpp
+++ b/src/fdct_inner.cpp
@@ -1,83 +1,176 @@
 
 #pragma once
-#include <iostream>
+#include "fdct_inner.hpp"
+
+#include <fftw3.h>
+
 #include <cmath>
 #include <complex>
 #include <cstddef>
+#include <iostream>
 #include <vector>
 
-
-#include "fdct_inner.hpp"
 #include "complex_array.hpp"
 #include "fft.hpp"
+
 // Function to compute the Fast Cosine Transform
 
 ComplexArray reorderInput(const ComplexArray &x) {
-  size_t N = x.size();
-  ComplexArray u(N);
-  const double *x_real = x.get_real_ptr();
-  const double *x_imag = x.get_imag_ptr();
-  double *u_real = u.get_real_ptr();
-  double *u_imag = u.get_imag_ptr();
-
-  for (size_t n = 0; n < N / 2; ++n) {
-    u_real[n] = x_real[2 * n];
-    u_imag[n] = x_imag[2 * n];
-    u_real[N - n - 1] = x_real[2 * n + 1];
-    u_imag[N - n - 1] = x_imag[2 * n + 1];
-  }
-  return u;
+    size_t N = x.size();
+    ComplexArray u(N);
+    const double *x_real = x.get_real_ptr();
+    const double *x_imag = x.get_imag_ptr();
+    double *u_real = u.get_real_ptr();
+    double *u_imag = u.get_imag_ptr();
+
+    for (size_t n = 0; n < N / 2; ++n) {
+        u_real[n] = x_real[2 * n];
+        u_imag[n] = x_imag[2 * n];
+        u_real[N - n - 1] = x_real[2 * n + 1];
+        u_imag[N - n - 1] = x_imag[2 * n + 1];
+    }
+    if (N % 2 != 0) {
+        u_real[N / 2] = x_real[N - 1];
+        u_imag[N / 2] = x_imag[N - 1];
+    }
+    return u;
+}
+
+ComplexArray reorderOutput(const ComplexArray &v) {
+    size_t N = v.size();
+    ComplexArray x(N);
+
+    // Access the real and imaginary parts directly
+    const double *v_real = v.get_real_ptr();
+    const double *v_imag = v.get_imag_ptr();
+    double *x_real = x.get_real_ptr();
+    double *x_imag = x.get_imag_ptr();
+
+    // Reordering according to the formula
+    for (size_t n = 0; n < N / 2; ++n) {
+        x_real[2 * n] = v_real[n];
+        x_imag[2 * n] = v_imag[n];
+        x_real[2 * n + 1] = v_real[N - n - 1];
+        x_imag[2 * n + 1] = v_imag[N - n - 1];
+    }
+    if (N % 2 != 0) {
+        x_real[N - 1] = v_real[N / 2];
+        x_imag[N - 1] = v_imag[N / 2];
+    }
+
+    return x;
+}
+
+ComplexArray idctPreprocess(const ComplexArray &x) {
+    size_t N = x.size();
+    ComplexArray u(N);
+    ComplexArray v(N);
+    v = x;
+    v.get_real_ptr()[0] /= 2;
+    // for (size_t n = 0; n < N; ++n) {
+    //     // Compute the complex term
+    //     Complex W = std::polar(1.0, -M_PI * n / (2 * N));
+
+    //     // Compute u[n]
+    //     u[n] = 0.5 * (x[n] - x[N - n - 1] * Complex(0, 1)) * W;
+    // }
+    
+    for (size_t n = 0; n < N; ++n) {
+        double tmp = n * M_PI / (2 * N);
+        u.get_real_ptr()[n] = v[n].real() * cos(tmp);
+        u.get_imag_ptr()[n] = -v[n].real() * sin(tmp);
+    }
+    return u;
 }
 
+
 std::vector<double> fct_inner(ComplexArray &x) {
-  size_t N = x.size();
-  std::vector<double> dct(N);
-  auto u = reorderInput(x);
-  ComplexArray fft_in(N);
-  std::vector<std::complex<double> > vc;
-  // Fill FFT input array
-  for (size_t n = 0; n < N; ++n) {
-    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
-    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
-    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
-  }
-  
-  // Perform FFT
-  fft::transform(fft_in, false);
-  
-  // Compute the DCT using the FFT output
-  for (size_t k = 0; k < N; ++k) {
-    double real = fft_in[k].real();
-    double imag = fft_in[k].imag();
-    dct[k] =
-        (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) * 2;
-  }
-  return dct;
+    size_t N = x.size();
+    std::vector<double> dct(N);
+    auto u = reorderInput(x);
+    ComplexArray fft_in(N);
+    std::vector<std::complex<double> > vc;
+    // Fill FFT input array
+    for (size_t n = 0; n < N; ++n) {
+        vc.push_back(
+            std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
+        fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
+        fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
+    }
+
+    // use fftw3
+    // fftw_complex *in, *out;
+    // fftw_plan p;
+    // in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+    // out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
+    // for (size_t i = 0; i < N; i++) {
+    //   in[i][0] = fft_in.get_real_ptr()[i];
+    //   in[i][1] = fft_in.get_imag_ptr()[i];
+    // }
+    // p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+    // fftw_execute(p);
+    // for (size_t i = 0; i < N; i++) {
+    //   fft_in.get_real_ptr()[i] = out[i][0];
+    //   fft_in.get_imag_ptr()[i] = out[i][1];
+    // }
+    // fftw_destroy_plan(p);
+    // fftw_free(in);
+    // fftw_free(out);
+
+    // Perform FFT
+    fft::transform(fft_in, false);
+    // std::vector<std::complex<double> > fft_in_p;
+    // for (size_t n = 0; n < N; ++n) {
+    //   fft_in_p.push_back(std::complex<double>(fft_in.get_real_ptr()[n],
+    //   fft_in.get_imag_ptr()[n]));
+    // }
+    // Fft::transform(fft_in_p, false);
+    // for (size_t n = 0; n < N; ++n) {
+    //   fft_in.get_real_ptr()[n] = fft_in_p[n].real();
+    //   fft_in.get_imag_ptr()[n] = fft_in_p[n].imag();
+    // }
+
+    // Compute the DCT using the FFT output
+    for (size_t k = 0; k < N; ++k) {
+        double real = fft_in[k].real();
+        double imag = fft_in[k].imag();
+        dct[k] =
+            (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) *
+            2;
+    }
+    return dct;
 }
 
 std::vector<double> ifdct_inner(ComplexArray &c) {
-  size_t N = c.size();
-  std::vector<double> x(N);
-  auto u = reorderInput(c);
-  ComplexArray fft_in(N);
-  std::vector<std::complex<double> > vc;
-  // Fill FFT input array
-  for (size_t n = 0; n < N; ++n) {
-    vc.push_back(std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
-    fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
-    fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
-  }
-  
-  // Perform FFT
-  fft::transform(fft_in, true);
-  
-  // Compute the IDCT using the FFT output
-  for (size_t k = 0; k < N; ++k) {
-    double real = fft_in[k].real();
-    double imag = fft_in[k].imag();
-    x[k] =
-	(real * cos(M_PI * k / (2.0 * N)) + imag * sin(M_PI * k / (2.0 * N))) /
-	N;
-  }
-  return x;
-}
\ No newline at end of file
+    size_t N = c.size();
+    std::vector<double> x(N);
+    // auto u = reorderInput(c);
+    auto u = idctPreprocess(c);
+    ComplexArray fft_in(N);
+    std::vector<std::complex<double> > vc;
+    // Fill FFT input array
+    for (size_t n = 0; n < N; ++n) {
+        vc.push_back(
+            std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
+        fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
+        fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
+    }
+
+    // Perform FFT
+    fft::transform(fft_in, false);
+
+    auto x_ = reorderOutput(fft_in);
+    for (size_t n = 0; n < N; ++n) {
+        x[n] = x_.get_real_ptr()[n] / N;
+    }
+    
+    // Compute the IDCT using the FFT output
+    // for (size_t k = 0; k < N; ++k) {
+    //     double real = fft_in[k].real();
+    //     double imag = fft_in[k].imag();
+    //     x[k] = (real * cos(M_PI * k / (2.0 * N)) +
+    //             imag * sin(M_PI * k / (2.0 * N))) /
+    //            N;
+    // }
+    return x;
+}
diff --git a/src/fft.cpp b/src/fft.cpp
index 93bd45d..692d3f3 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -73,7 +73,7 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
   }
 
   // Trigonometric table
-  ComplexArray expTable(m);
+  ComplexArray expTable(n);
   double *real_expTable = expTable.get_real_ptr();
   double *imag_expTable = expTable.get_imag_ptr();
   for (size_t i = 0; i < n; i++) {
@@ -105,7 +105,10 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
   
   ComplexArray c = convolove(a, b);
 
-  x = c * expTable;
+  for (size_t i = 0; i < n; i++) {
+    real[i] = c.get_real_ptr()[i] * real_expTable[i] - c.get_imag_ptr()[i] * imag_expTable[i];
+    imag[i] = c.get_real_ptr()[i] * imag_expTable[i] + c.get_imag_ptr()[i] * real_expTable[i];
+  }
 }
 
 ComplexArray fft::convolove(ComplexArray x, ComplexArray y) {
diff --git a/src/ref/FastDctFft.hpp b/src/ref/FastDctFft.hpp
new file mode 100644
index 0000000..624234d
--- /dev/null
+++ b/src/ref/FastDctFft.hpp
@@ -0,0 +1,36 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <vector>
+
+
+namespace FastDctFft {
+	
+	void transform(std::vector<double> &vec);
+	
+	void inverseTransform(std::vector<double> &vec);
+	
+}
+
diff --git a/src/ref/FastDctFtt.cpp b/src/ref/FastDctFtt.cpp
new file mode 100644
index 0000000..5acfa96
--- /dev/null
+++ b/src/ref/FastDctFtt.cpp
@@ -0,0 +1,75 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include "FastDctFft.hpp"
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::vector;
+
+
+// DCT type II, unscaled
+void FastDctFft::transform(vector<double> &vec) {
+	size_t len = vec.size();
+	size_t halfLen = len / 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < halfLen; i++) {
+		real.at(i) = vec.at(i * 2);
+		real.at(len - 1 - i) = vec.at(i * 2 + 1);
+	}
+	if (len % 2 == 1)
+		real.at(halfLen) = vec.at(len - 1);
+	std::fill(vec.begin(), vec.end(), 0.0);
+	Fft::transform(real, vec);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		vec.at(i) = real.at(i) * std::cos(temp) + vec.at(i) * std::sin(temp);
+	}
+}
+
+
+// DCT type III, unscaled
+void FastDctFft::inverseTransform(vector<double> &vec) {
+	size_t len = vec.size();
+	if (len > 0)
+		vec.at(0) /= 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		real.at(i) = vec.at(i) * std::cos(temp);
+		vec.at(i) *= -std::sin(temp);
+	}
+	Fft::transform(real, vec);
+	
+	size_t halfLen = len / 2;
+	for (size_t i = 0; i < halfLen; i++) {
+		vec.at(i * 2 + 0) = real.at(i);
+		vec.at(i * 2 + 1) = real.at(len - 1 - i);
+	}
+	if (len % 2 == 1)
+		vec.at(len - 1) = real.at(halfLen);
+}
+
diff --git a/src/ref/FftComplex.cpp b/src/ref/FftComplex.cpp
new file mode 100644
index 0000000..4f842a0
--- /dev/null
+++ b/src/ref/FftComplex.cpp
@@ -0,0 +1,149 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <utility>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<complex<double> > &vec, bool inverse) {
+	size_t n = vec.size();
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(vec, inverse);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(vec, inverse);
+}
+
+
+void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
+	// Length variables
+	size_t n = vec.size();
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++)
+		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i)
+			std::swap(vec[i], vec[j]);
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				complex<double> temp = vec[j + halfsize] * expTable[k];
+				vec[j + halfsize] = vec[j] - temp;
+				vec[j] += temp;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = vec.size();
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = (inverse ? M_PI : -M_PI) * temp / n;
+		expTable[i] = std::polar(1.0, angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<complex<double> > avec(m);
+	for (size_t i = 0; i < n; i++)
+		avec[i] = vec[i] * expTable[i];
+	vector<complex<double> > bvec(m);
+	bvec[0] = expTable[0];
+	for (size_t i = 1; i < n; i++)
+		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
+	
+	// Convolution
+	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++)
+		vec[i] = cvec[i] * expTable[i];
+}
+
+
+vector<complex<double> > Fft::convolve(
+		vector<complex<double> > xvec,
+		vector<complex<double> > yvec) {
+	
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::domain_error("Mismatched lengths");
+	transform(xvec, false);
+	transform(yvec, false);
+	for (size_t i = 0; i < n; i++)
+		xvec[i] *= yvec[i];
+	transform(xvec, true);
+	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
+		xvec[i] /= static_cast<double>(n);
+	return xvec;
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
diff --git a/src/ref/FftComplex.hpp b/src/ref/FftComplex.hpp
new file mode 100644
index 0000000..24eeabb
--- /dev/null
+++ b/src/ref/FftComplex.hpp
@@ -0,0 +1,61 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <complex>
+#include <vector>
+using Complex = std::complex<double>;
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
+	 */
+	void transform(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::vector<std::complex<double> > convolve(
+		std::vector<std::complex<double> > xvec,
+		std::vector<std::complex<double> > yvec);
+	
+}
diff --git a/src/ref/FftComplexTest.cpp b/src/ref/FftComplexTest.cpp
new file mode 100644
index 0000000..dd960d1
--- /dev/null
+++ b/src/ref/FftComplexTest.cpp
@@ -0,0 +1,177 @@
+/* 
+ * FFT and convolution test (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::cout;
+using std::endl;
+using std::vector;
+
+
+// Private function prototypes
+static void testFft(int n);
+static void testConvolution(int n);
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
+static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static vector<complex<double> > randomComplexes(int n);
+
+// Mutable global variable
+static double maxLogError = -INFINITY;
+
+// Random number generation
+std::default_random_engine randGen((std::random_device())());
+
+
+/*---- Main and test functions ----*/
+
+int main() {
+	// Test power-of-2 size FFTs
+	for (int i = 0; i <= 12; i++)
+		testFft(1 << i);
+	
+	// Test small size FFTs
+	for (int i = 0; i < 30; i++)
+		testFft(i);
+	
+	// Test diverse size FFTs
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testFft(n);
+			prev = n;
+		}
+	}
+	
+	// Test power-of-2 size convolutions
+	for (int i = 0; i <= 12; i++)
+		testConvolution(1 << i);
+	
+	// Test diverse size convolutions
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testConvolution(n);
+			prev = n;
+		}
+	}
+	
+	cout << endl;
+	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
+	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
+	return EXIT_SUCCESS;
+}
+
+
+static void testFft(int n) {
+	const vector<complex<double> > input = randomComplexes(n);
+	const vector<complex<double> > expect = naiveDft(input, false);
+	vector<complex<double> > actual = input;
+	Fft::transform(actual, false);
+	double err = log10RmsErr(expect, actual);
+	
+	for (auto it = actual.begin(); it != actual.end(); ++it)
+		*it /= n;
+	Fft::transform(actual, true);
+	err = std::max(log10RmsErr(input, actual), err);
+	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << err << endl;
+}
+
+
+static void testConvolution(int n) {
+	const vector<complex<double> > input0 = randomComplexes(n);
+	const vector<complex<double> > input1 = randomComplexes(n);
+	const vector<complex<double> > expect = naiveConvolve(input0, input1);
+	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
+	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << log10RmsErr(expect, actual) << endl;
+}
+
+
+/*---- Naive reference computation functions ----*/
+
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
+	int n = static_cast<int>(input.size());
+	vector<complex<double> > output;
+	double coef = (inverse ? 2 : -2) * M_PI / n;
+	for (int k = 0; k < n; k++) {  // For each output element
+		complex<double> sum(0);
+		for (int t = 0; t < n; t++) {  // For each input element
+			double angle = coef * (static_cast<long long>(t) * k % n);
+			sum += input[t] * std::polar(1.0, angle);
+		}
+		output.push_back(sum);
+	}
+	return output;
+}
+
+
+static vector<complex<double> > naiveConvolve(
+		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	vector<complex<double> > result(n);  // All zeros
+	for (int i = 0; i < n; i++) {
+		for (int j = 0; j < n; j++) {
+			int k = (i + j) % n;
+			result[k] += xvec[i] * yvec[j];
+		}
+	}
+	return result;
+}
+
+
+/*---- Utility functions ----*/
+
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	double err = std::pow(10, -99 * 2);
+	for (int i = 0; i < n; i++)
+		err += std::norm(xvec.at(i) - yvec.at(i));
+	err /= n > 0 ? n : 1;
+	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
+	err = std::log10(err);
+	maxLogError = std::max(err, maxLogError);
+	return err;
+}
+
+
+static vector<complex<double> > randomComplexes(int n) {
+	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
+	vector<complex<double> > result;
+	for (int i = 0; i < n; i++)
+		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
+	return result;
+}
diff --git a/src/ref/FftRealPair.cpp b/src/ref/FftRealPair.cpp
new file mode 100644
index 0000000..9bb414f
--- /dev/null
+++ b/src/ref/FftRealPair.cpp
@@ -0,0 +1,195 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<double> &real, vector<double> &imag) {
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(real, imag);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(real, imag);
+}
+
+
+void Fft::inverseTransform(vector<double> &real, vector<double> &imag) {
+	transform(imag, real);
+}
+
+
+void Fft::transformRadix2(vector<double> &real, vector<double> &imag) {
+	// Length variables
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric tables
+	vector<double> cosTable(n / 2);
+	vector<double> sinTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++) {
+		cosTable[i] = std::cos(2 * M_PI * i / n);
+		sinTable[i] = std::sin(2 * M_PI * i / n);
+	}
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i) {
+			std::swap(real[i], real[j]);
+			std::swap(imag[i], imag[j]);
+		}
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				size_t l = j + halfsize;
+				double tpre =  real[l] * cosTable[k] + imag[l] * sinTable[k];
+				double tpim = -real[l] * sinTable[k] + imag[l] * cosTable[k];
+				real[l] = real[j] - tpre;
+				imag[l] = imag[j] - tpim;
+				real[j] += tpre;
+				imag[j] += tpim;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<double> &real, vector<double> &imag) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric tables
+	vector<double> cosTable(n), sinTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = M_PI * temp / n;
+		cosTable[i] = std::cos(angle);
+		sinTable[i] = std::sin(angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<double> areal(m), aimag(m);
+	for (size_t i = 0; i < n; i++) {
+		areal[i] =  real[i] * cosTable[i] + imag[i] * sinTable[i];
+		aimag[i] = -real[i] * sinTable[i] + imag[i] * cosTable[i];
+	}
+	vector<double> breal(m), bimag(m);
+	breal[0] = cosTable[0];
+	bimag[0] = sinTable[0];
+	for (size_t i = 1; i < n; i++) {
+		breal[i] = breal[m - i] = cosTable[i];
+		bimag[i] = bimag[m - i] = sinTable[i];
+	}
+	
+	// Convolution
+	std::pair<vector<double>, vector<double> > cvec = convolve(areal, aimag, breal, bimag);
+	vector<double> creal = std::move(cvec.first );
+	vector<double> cimag = std::move(cvec.second);
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++) {
+		real[i] =  creal[i] * cosTable[i] + cimag[i] * sinTable[i];
+		imag[i] = -creal[i] * sinTable[i] + cimag[i] * cosTable[i];
+	}
+}
+
+
+vector<double> Fft::convolve(vector<double> xvec, vector<double> yvec) {
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::invalid_argument("Mismatched lengths");
+	return convolve(std::move(xvec), vector<double>(n), std::move(yvec), vector<double>(n)).first;
+}
+
+
+std::pair<vector<double>, vector<double> > Fft::convolve(
+		vector<double> xreal, vector<double> ximag,
+		vector<double> yreal, vector<double> yimag) {
+	
+	size_t n = xreal.size();
+	if (n != ximag.size() || n != yreal.size() || n != yimag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	
+	transform(xreal, ximag);
+	transform(yreal, yimag);
+	
+	for (size_t i = 0; i < n; i++) {
+		double temp = xreal[i] * yreal[i] - ximag[i] * yimag[i];
+		ximag[i] = ximag[i] * yreal[i] + xreal[i] * yimag[i];
+		xreal[i] = temp;
+	}
+	inverseTransform(xreal, ximag);
+	
+	for (size_t i = 0; i < n; i++) {  // Scaling (because this FFT implementation omits it)
+		xreal[i] /= n;
+		ximag[i] /= n;
+	}
+	return std::pair<vector<double>, vector<double> >(std::move(xreal), std::move(ximag));
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
+
diff --git a/src/ref/FftRealPair.hpp b/src/ref/FftRealPair.hpp
new file mode 100644
index 0000000..95db10a
--- /dev/null
+++ b/src/ref/FftRealPair.hpp
@@ -0,0 +1,75 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function.
+	 */
+	void transform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the inverse discrete Fourier transform (IDFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. This transform does not perform scaling, so the inverse is not a true inverse.
+	 */
+	void inverseTransform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given real vectors. Each vector's length must be the same.
+	 */
+	std::vector<double> convolve(std::vector<double> xvec, std::vector<double> yvec);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::pair<std::vector<double>, std::vector<double> > convolve(
+		std::vector<double> xreal, std::vector<double> ximag,
+		std::vector<double> yreal, std::vector<double> yimag);
+	
+}
+
diff --git a/src/ref/README.org b/src/ref/README.org
new file mode 100644
index 0000000..74fd6b9
--- /dev/null
+++ b/src/ref/README.org
@@ -0,0 +1,5 @@
+* Reference Implementation
+
+This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
+
+For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]

From 2d71e4a036a5966c32d74b2c93b4d1423fde18b9 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 08:02:52 +0000
Subject: [PATCH 09/16] chore: better test

---
 test_dct.py | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/test_dct.py b/test_dct.py
index 98eeb4f..40a233f 100644
--- a/test_dct.py
+++ b/test_dct.py
@@ -2,35 +2,24 @@
 import numpy as np
 import fdct
 import scipy.fft as fft
+import time
 
 # Parameters
-test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097]
+test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097, 8192, 8193]
 tolerance = 1e-6  # Tolerance for individual element comparison
 
-@pytest.mark.parametrize("size", test_size_seq)
-def test_dct_naive(size):
-    inp = np.random.rand(size)
-    gt = fft.dct(inp)
-    ans = fdct.dct_naive(inp)
-    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
-
-@pytest.mark.parametrize("size", test_size_seq)
-def test_idct_naive(size):
-    inp = np.random.rand(size)
-    gt = fft.idct(inp)
-    ans = fdct.idct_naive(inp)
-    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
-
-@pytest.mark.parametrize("size", test_size_seq)
-def test_fdct(size):
-    inp = np.random.rand(size)
-    gt = fft.dct(inp)
-    ans = fdct.fdct(inp)
-    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
+# List of test functions and their corresponding references
+test_cases = [
+    ("dct_naive", fdct.dct_naive, fft.dct),
+    ("idct_naive", fdct.idct_naive, fft.idct),
+    ("fdct", fdct.fdct, fft.dct),
+    ("ifdct", fdct.ifdct, fft.idct),
+]
 
 @pytest.mark.parametrize("size", test_size_seq)
-def test_ifdct(size):
+@pytest.mark.parametrize("name, func, ref_func", test_cases)
+def test_transform(name, func, ref_func, size):
     inp = np.random.rand(size)
-    gt = fft.idct(inp)
-    ans = fdct.ifdct(inp)
+    gt = ref_func(inp)
+    ans = func(inp)
     np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)

From 988c0d742f40d079baf844407a956661c4c060d8 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 16:03:05 +0000
Subject: [PATCH 10/16] feat: add reference impl

---
 ref/FastDctFft.hpp     |  36 ++++++++
 ref/FastDctFtt.cpp     |  75 ++++++++++++++++
 ref/FftComplex.cpp     | 149 +++++++++++++++++++++++++++++++
 ref/FftComplex.hpp     |  61 +++++++++++++
 ref/FftComplexTest.cpp | 177 +++++++++++++++++++++++++++++++++++++
 ref/FftRealPair.cpp    | 195 +++++++++++++++++++++++++++++++++++++++++
 ref/FftRealPair.hpp    |  75 ++++++++++++++++
 ref/README.org         |   5 ++
 8 files changed, 773 insertions(+)
 create mode 100644 ref/FastDctFft.hpp
 create mode 100644 ref/FastDctFtt.cpp
 create mode 100644 ref/FftComplex.cpp
 create mode 100644 ref/FftComplex.hpp
 create mode 100644 ref/FftComplexTest.cpp
 create mode 100644 ref/FftRealPair.cpp
 create mode 100644 ref/FftRealPair.hpp
 create mode 100644 ref/README.org

diff --git a/ref/FastDctFft.hpp b/ref/FastDctFft.hpp
new file mode 100644
index 0000000..624234d
--- /dev/null
+++ b/ref/FastDctFft.hpp
@@ -0,0 +1,36 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <vector>
+
+
+namespace FastDctFft {
+	
+	void transform(std::vector<double> &vec);
+	
+	void inverseTransform(std::vector<double> &vec);
+	
+}
+
diff --git a/ref/FastDctFtt.cpp b/ref/FastDctFtt.cpp
new file mode 100644
index 0000000..5acfa96
--- /dev/null
+++ b/ref/FastDctFtt.cpp
@@ -0,0 +1,75 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include "FastDctFft.hpp"
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::vector;
+
+
+// DCT type II, unscaled
+void FastDctFft::transform(vector<double> &vec) {
+	size_t len = vec.size();
+	size_t halfLen = len / 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < halfLen; i++) {
+		real.at(i) = vec.at(i * 2);
+		real.at(len - 1 - i) = vec.at(i * 2 + 1);
+	}
+	if (len % 2 == 1)
+		real.at(halfLen) = vec.at(len - 1);
+	std::fill(vec.begin(), vec.end(), 0.0);
+	Fft::transform(real, vec);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		vec.at(i) = real.at(i) * std::cos(temp) + vec.at(i) * std::sin(temp);
+	}
+}
+
+
+// DCT type III, unscaled
+void FastDctFft::inverseTransform(vector<double> &vec) {
+	size_t len = vec.size();
+	if (len > 0)
+		vec.at(0) /= 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		real.at(i) = vec.at(i) * std::cos(temp);
+		vec.at(i) *= -std::sin(temp);
+	}
+	Fft::transform(real, vec);
+	
+	size_t halfLen = len / 2;
+	for (size_t i = 0; i < halfLen; i++) {
+		vec.at(i * 2 + 0) = real.at(i);
+		vec.at(i * 2 + 1) = real.at(len - 1 - i);
+	}
+	if (len % 2 == 1)
+		vec.at(len - 1) = real.at(halfLen);
+}
+
diff --git a/ref/FftComplex.cpp b/ref/FftComplex.cpp
new file mode 100644
index 0000000..4f842a0
--- /dev/null
+++ b/ref/FftComplex.cpp
@@ -0,0 +1,149 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <utility>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<complex<double> > &vec, bool inverse) {
+	size_t n = vec.size();
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(vec, inverse);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(vec, inverse);
+}
+
+
+void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
+	// Length variables
+	size_t n = vec.size();
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++)
+		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i)
+			std::swap(vec[i], vec[j]);
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				complex<double> temp = vec[j + halfsize] * expTable[k];
+				vec[j + halfsize] = vec[j] - temp;
+				vec[j] += temp;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = vec.size();
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = (inverse ? M_PI : -M_PI) * temp / n;
+		expTable[i] = std::polar(1.0, angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<complex<double> > avec(m);
+	for (size_t i = 0; i < n; i++)
+		avec[i] = vec[i] * expTable[i];
+	vector<complex<double> > bvec(m);
+	bvec[0] = expTable[0];
+	for (size_t i = 1; i < n; i++)
+		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
+	
+	// Convolution
+	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++)
+		vec[i] = cvec[i] * expTable[i];
+}
+
+
+vector<complex<double> > Fft::convolve(
+		vector<complex<double> > xvec,
+		vector<complex<double> > yvec) {
+	
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::domain_error("Mismatched lengths");
+	transform(xvec, false);
+	transform(yvec, false);
+	for (size_t i = 0; i < n; i++)
+		xvec[i] *= yvec[i];
+	transform(xvec, true);
+	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
+		xvec[i] /= static_cast<double>(n);
+	return xvec;
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
diff --git a/ref/FftComplex.hpp b/ref/FftComplex.hpp
new file mode 100644
index 0000000..24eeabb
--- /dev/null
+++ b/ref/FftComplex.hpp
@@ -0,0 +1,61 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <complex>
+#include <vector>
+using Complex = std::complex<double>;
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
+	 */
+	void transform(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::vector<std::complex<double> > convolve(
+		std::vector<std::complex<double> > xvec,
+		std::vector<std::complex<double> > yvec);
+	
+}
diff --git a/ref/FftComplexTest.cpp b/ref/FftComplexTest.cpp
new file mode 100644
index 0000000..dd960d1
--- /dev/null
+++ b/ref/FftComplexTest.cpp
@@ -0,0 +1,177 @@
+/* 
+ * FFT and convolution test (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::cout;
+using std::endl;
+using std::vector;
+
+
+// Private function prototypes
+static void testFft(int n);
+static void testConvolution(int n);
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
+static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static vector<complex<double> > randomComplexes(int n);
+
+// Mutable global variable
+static double maxLogError = -INFINITY;
+
+// Random number generation
+std::default_random_engine randGen((std::random_device())());
+
+
+/*---- Main and test functions ----*/
+
+int main() {
+	// Test power-of-2 size FFTs
+	for (int i = 0; i <= 12; i++)
+		testFft(1 << i);
+	
+	// Test small size FFTs
+	for (int i = 0; i < 30; i++)
+		testFft(i);
+	
+	// Test diverse size FFTs
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testFft(n);
+			prev = n;
+		}
+	}
+	
+	// Test power-of-2 size convolutions
+	for (int i = 0; i <= 12; i++)
+		testConvolution(1 << i);
+	
+	// Test diverse size convolutions
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testConvolution(n);
+			prev = n;
+		}
+	}
+	
+	cout << endl;
+	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
+	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
+	return EXIT_SUCCESS;
+}
+
+
+static void testFft(int n) {
+	const vector<complex<double> > input = randomComplexes(n);
+	const vector<complex<double> > expect = naiveDft(input, false);
+	vector<complex<double> > actual = input;
+	Fft::transform(actual, false);
+	double err = log10RmsErr(expect, actual);
+	
+	for (auto it = actual.begin(); it != actual.end(); ++it)
+		*it /= n;
+	Fft::transform(actual, true);
+	err = std::max(log10RmsErr(input, actual), err);
+	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << err << endl;
+}
+
+
+static void testConvolution(int n) {
+	const vector<complex<double> > input0 = randomComplexes(n);
+	const vector<complex<double> > input1 = randomComplexes(n);
+	const vector<complex<double> > expect = naiveConvolve(input0, input1);
+	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
+	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << log10RmsErr(expect, actual) << endl;
+}
+
+
+/*---- Naive reference computation functions ----*/
+
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
+	int n = static_cast<int>(input.size());
+	vector<complex<double> > output;
+	double coef = (inverse ? 2 : -2) * M_PI / n;
+	for (int k = 0; k < n; k++) {  // For each output element
+		complex<double> sum(0);
+		for (int t = 0; t < n; t++) {  // For each input element
+			double angle = coef * (static_cast<long long>(t) * k % n);
+			sum += input[t] * std::polar(1.0, angle);
+		}
+		output.push_back(sum);
+	}
+	return output;
+}
+
+
+static vector<complex<double> > naiveConvolve(
+		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	vector<complex<double> > result(n);  // All zeros
+	for (int i = 0; i < n; i++) {
+		for (int j = 0; j < n; j++) {
+			int k = (i + j) % n;
+			result[k] += xvec[i] * yvec[j];
+		}
+	}
+	return result;
+}
+
+
+/*---- Utility functions ----*/
+
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	double err = std::pow(10, -99 * 2);
+	for (int i = 0; i < n; i++)
+		err += std::norm(xvec.at(i) - yvec.at(i));
+	err /= n > 0 ? n : 1;
+	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
+	err = std::log10(err);
+	maxLogError = std::max(err, maxLogError);
+	return err;
+}
+
+
+static vector<complex<double> > randomComplexes(int n) {
+	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
+	vector<complex<double> > result;
+	for (int i = 0; i < n; i++)
+		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
+	return result;
+}
diff --git a/ref/FftRealPair.cpp b/ref/FftRealPair.cpp
new file mode 100644
index 0000000..9bb414f
--- /dev/null
+++ b/ref/FftRealPair.cpp
@@ -0,0 +1,195 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<double> &real, vector<double> &imag) {
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(real, imag);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(real, imag);
+}
+
+
+void Fft::inverseTransform(vector<double> &real, vector<double> &imag) {
+	transform(imag, real);
+}
+
+
+void Fft::transformRadix2(vector<double> &real, vector<double> &imag) {
+	// Length variables
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric tables
+	vector<double> cosTable(n / 2);
+	vector<double> sinTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++) {
+		cosTable[i] = std::cos(2 * M_PI * i / n);
+		sinTable[i] = std::sin(2 * M_PI * i / n);
+	}
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i) {
+			std::swap(real[i], real[j]);
+			std::swap(imag[i], imag[j]);
+		}
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				size_t l = j + halfsize;
+				double tpre =  real[l] * cosTable[k] + imag[l] * sinTable[k];
+				double tpim = -real[l] * sinTable[k] + imag[l] * cosTable[k];
+				real[l] = real[j] - tpre;
+				imag[l] = imag[j] - tpim;
+				real[j] += tpre;
+				imag[j] += tpim;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<double> &real, vector<double> &imag) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric tables
+	vector<double> cosTable(n), sinTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = M_PI * temp / n;
+		cosTable[i] = std::cos(angle);
+		sinTable[i] = std::sin(angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<double> areal(m), aimag(m);
+	for (size_t i = 0; i < n; i++) {
+		areal[i] =  real[i] * cosTable[i] + imag[i] * sinTable[i];
+		aimag[i] = -real[i] * sinTable[i] + imag[i] * cosTable[i];
+	}
+	vector<double> breal(m), bimag(m);
+	breal[0] = cosTable[0];
+	bimag[0] = sinTable[0];
+	for (size_t i = 1; i < n; i++) {
+		breal[i] = breal[m - i] = cosTable[i];
+		bimag[i] = bimag[m - i] = sinTable[i];
+	}
+	
+	// Convolution
+	std::pair<vector<double>, vector<double> > cvec = convolve(areal, aimag, breal, bimag);
+	vector<double> creal = std::move(cvec.first );
+	vector<double> cimag = std::move(cvec.second);
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++) {
+		real[i] =  creal[i] * cosTable[i] + cimag[i] * sinTable[i];
+		imag[i] = -creal[i] * sinTable[i] + cimag[i] * cosTable[i];
+	}
+}
+
+
+vector<double> Fft::convolve(vector<double> xvec, vector<double> yvec) {
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::invalid_argument("Mismatched lengths");
+	return convolve(std::move(xvec), vector<double>(n), std::move(yvec), vector<double>(n)).first;
+}
+
+
+std::pair<vector<double>, vector<double> > Fft::convolve(
+		vector<double> xreal, vector<double> ximag,
+		vector<double> yreal, vector<double> yimag) {
+	
+	size_t n = xreal.size();
+	if (n != ximag.size() || n != yreal.size() || n != yimag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	
+	transform(xreal, ximag);
+	transform(yreal, yimag);
+	
+	for (size_t i = 0; i < n; i++) {
+		double temp = xreal[i] * yreal[i] - ximag[i] * yimag[i];
+		ximag[i] = ximag[i] * yreal[i] + xreal[i] * yimag[i];
+		xreal[i] = temp;
+	}
+	inverseTransform(xreal, ximag);
+	
+	for (size_t i = 0; i < n; i++) {  // Scaling (because this FFT implementation omits it)
+		xreal[i] /= n;
+		ximag[i] /= n;
+	}
+	return std::pair<vector<double>, vector<double> >(std::move(xreal), std::move(ximag));
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
+
diff --git a/ref/FftRealPair.hpp b/ref/FftRealPair.hpp
new file mode 100644
index 0000000..95db10a
--- /dev/null
+++ b/ref/FftRealPair.hpp
@@ -0,0 +1,75 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function.
+	 */
+	void transform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the inverse discrete Fourier transform (IDFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. This transform does not perform scaling, so the inverse is not a true inverse.
+	 */
+	void inverseTransform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given real vectors. Each vector's length must be the same.
+	 */
+	std::vector<double> convolve(std::vector<double> xvec, std::vector<double> yvec);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::pair<std::vector<double>, std::vector<double> > convolve(
+		std::vector<double> xreal, std::vector<double> ximag,
+		std::vector<double> yreal, std::vector<double> yimag);
+	
+}
+
diff --git a/ref/README.org b/ref/README.org
new file mode 100644
index 0000000..74fd6b9
--- /dev/null
+++ b/ref/README.org
@@ -0,0 +1,5 @@
+* Reference Implementation
+
+This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
+
+For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]

From 921202c335076fca481f8c423f1bd1ecc607c86f Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 16:15:32 +0000
Subject: [PATCH 11/16] chore: remove ref in src

---
 src/ref/FastDctFft.hpp     |  36 -------
 src/ref/FastDctFtt.cpp     |  75 --------------
 src/ref/FftComplex.cpp     | 149 ----------------------------
 src/ref/FftComplex.hpp     |  61 ------------
 src/ref/FftComplexTest.cpp | 177 ---------------------------------
 src/ref/FftRealPair.cpp    | 195 -------------------------------------
 src/ref/FftRealPair.hpp    |  75 --------------
 src/ref/README.org         |   5 -
 8 files changed, 773 deletions(-)
 delete mode 100644 src/ref/FastDctFft.hpp
 delete mode 100644 src/ref/FastDctFtt.cpp
 delete mode 100644 src/ref/FftComplex.cpp
 delete mode 100644 src/ref/FftComplex.hpp
 delete mode 100644 src/ref/FftComplexTest.cpp
 delete mode 100644 src/ref/FftRealPair.cpp
 delete mode 100644 src/ref/FftRealPair.hpp
 delete mode 100644 src/ref/README.org

diff --git a/src/ref/FastDctFft.hpp b/src/ref/FastDctFft.hpp
deleted file mode 100644
index 624234d..0000000
--- a/src/ref/FastDctFft.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/* 
- * Fast discrete cosine transform algorithms (C++)
- * 
- * Copyright (c) 2017 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#pragma once
-
-#include <vector>
-
-
-namespace FastDctFft {
-	
-	void transform(std::vector<double> &vec);
-	
-	void inverseTransform(std::vector<double> &vec);
-	
-}
-
diff --git a/src/ref/FastDctFtt.cpp b/src/ref/FastDctFtt.cpp
deleted file mode 100644
index 5acfa96..0000000
--- a/src/ref/FastDctFtt.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* 
- * Fast discrete cosine transform algorithms (C++)
- * 
- * Copyright (c) 2017 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include "FastDctFft.hpp"
-#include "FftRealPair.hpp"
-
-using std::size_t;
-using std::vector;
-
-
-// DCT type II, unscaled
-void FastDctFft::transform(vector<double> &vec) {
-	size_t len = vec.size();
-	size_t halfLen = len / 2;
-	vector<double> real(len);
-	for (size_t i = 0; i < halfLen; i++) {
-		real.at(i) = vec.at(i * 2);
-		real.at(len - 1 - i) = vec.at(i * 2 + 1);
-	}
-	if (len % 2 == 1)
-		real.at(halfLen) = vec.at(len - 1);
-	std::fill(vec.begin(), vec.end(), 0.0);
-	Fft::transform(real, vec);
-	for (size_t i = 0; i < len; i++) {
-		double temp = i * M_PI / (len * 2);
-		vec.at(i) = real.at(i) * std::cos(temp) + vec.at(i) * std::sin(temp);
-	}
-}
-
-
-// DCT type III, unscaled
-void FastDctFft::inverseTransform(vector<double> &vec) {
-	size_t len = vec.size();
-	if (len > 0)
-		vec.at(0) /= 2;
-	vector<double> real(len);
-	for (size_t i = 0; i < len; i++) {
-		double temp = i * M_PI / (len * 2);
-		real.at(i) = vec.at(i) * std::cos(temp);
-		vec.at(i) *= -std::sin(temp);
-	}
-	Fft::transform(real, vec);
-	
-	size_t halfLen = len / 2;
-	for (size_t i = 0; i < halfLen; i++) {
-		vec.at(i * 2 + 0) = real.at(i);
-		vec.at(i * 2 + 1) = real.at(len - 1 - i);
-	}
-	if (len % 2 == 1)
-		vec.at(len - 1) = real.at(halfLen);
-}
-
diff --git a/src/ref/FftComplex.cpp b/src/ref/FftComplex.cpp
deleted file mode 100644
index 4f842a0..0000000
--- a/src/ref/FftComplex.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <utility>
-#include "FftComplex.hpp"
-
-using std::complex;
-using std::size_t;
-using std::uintmax_t;
-using std::vector;
-
-
-// Private function prototypes
-static size_t reverseBits(size_t val, int width);
-
-
-void Fft::transform(vector<complex<double> > &vec, bool inverse) {
-	size_t n = vec.size();
-	if (n == 0)
-		return;
-	else if ((n & (n - 1)) == 0)  // Is power of 2
-		transformRadix2(vec, inverse);
-	else  // More complicated algorithm for arbitrary sizes
-		transformBluestein(vec, inverse);
-}
-
-
-void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
-	// Length variables
-	size_t n = vec.size();
-	int levels = 0;  // Compute levels = floor(log2(n))
-	for (size_t temp = n; temp > 1U; temp >>= 1)
-		levels++;
-	if (static_cast<size_t>(1U) << levels != n)
-		throw std::domain_error("Length is not a power of 2");
-	
-	// Trigonometric table
-	vector<complex<double> > expTable(n / 2);
-	for (size_t i = 0; i < n / 2; i++)
-		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
-	
-	// Bit-reversed addressing permutation
-	for (size_t i = 0; i < n; i++) {
-		size_t j = reverseBits(i, levels);
-		if (j > i)
-			std::swap(vec[i], vec[j]);
-	}
-	
-	// Cooley-Tukey decimation-in-time radix-2 FFT
-	for (size_t size = 2; size <= n; size *= 2) {
-		size_t halfsize = size / 2;
-		size_t tablestep = n / size;
-		for (size_t i = 0; i < n; i += size) {
-			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
-				complex<double> temp = vec[j + halfsize] * expTable[k];
-				vec[j + halfsize] = vec[j] - temp;
-				vec[j] += temp;
-			}
-		}
-		if (size == n)  // Prevent overflow in 'size *= 2'
-			break;
-	}
-}
-
-
-void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
-	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
-	size_t n = vec.size();
-	size_t m = 1;
-	while (m / 2 <= n) {
-		if (m > SIZE_MAX / 2)
-			throw std::length_error("Vector too large");
-		m *= 2;
-	}
-	
-	// Trigonometric table
-	vector<complex<double> > expTable(n);
-	for (size_t i = 0; i < n; i++) {
-		uintmax_t temp = static_cast<uintmax_t>(i) * i;
-		temp %= static_cast<uintmax_t>(n) * 2;
-		double angle = (inverse ? M_PI : -M_PI) * temp / n;
-		expTable[i] = std::polar(1.0, angle);
-	}
-	
-	// Temporary vectors and preprocessing
-	vector<complex<double> > avec(m);
-	for (size_t i = 0; i < n; i++)
-		avec[i] = vec[i] * expTable[i];
-	vector<complex<double> > bvec(m);
-	bvec[0] = expTable[0];
-	for (size_t i = 1; i < n; i++)
-		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
-	
-	// Convolution
-	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
-	
-	// Postprocessing
-	for (size_t i = 0; i < n; i++)
-		vec[i] = cvec[i] * expTable[i];
-}
-
-
-vector<complex<double> > Fft::convolve(
-		vector<complex<double> > xvec,
-		vector<complex<double> > yvec) {
-	
-	size_t n = xvec.size();
-	if (n != yvec.size())
-		throw std::domain_error("Mismatched lengths");
-	transform(xvec, false);
-	transform(yvec, false);
-	for (size_t i = 0; i < n; i++)
-		xvec[i] *= yvec[i];
-	transform(xvec, true);
-	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
-		xvec[i] /= static_cast<double>(n);
-	return xvec;
-}
-
-
-static size_t reverseBits(size_t val, int width) {
-	size_t result = 0;
-	for (int i = 0; i < width; i++, val >>= 1)
-		result = (result << 1) | (val & 1U);
-	return result;
-}
diff --git a/src/ref/FftComplex.hpp b/src/ref/FftComplex.hpp
deleted file mode 100644
index 24eeabb..0000000
--- a/src/ref/FftComplex.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#pragma once
-
-#include <complex>
-#include <vector>
-using Complex = std::complex<double>;
-
-namespace Fft {
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
-	 */
-	void transform(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
-	 */
-	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
-	 * Uses Bluestein's chirp z-transform algorithm.
-	 */
-	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
-	
-	
-	/* 
-	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
-	 */
-	std::vector<std::complex<double> > convolve(
-		std::vector<std::complex<double> > xvec,
-		std::vector<std::complex<double> > yvec);
-	
-}
diff --git a/src/ref/FftComplexTest.cpp b/src/ref/FftComplexTest.cpp
deleted file mode 100644
index dd960d1..0000000
--- a/src/ref/FftComplexTest.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* 
- * FFT and convolution test (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdlib>
-#include <iomanip>
-#include <iostream>
-#include <random>
-#include <utility>
-#include <vector>
-#include "FftComplex.hpp"
-
-using std::complex;
-using std::cout;
-using std::endl;
-using std::vector;
-
-
-// Private function prototypes
-static void testFft(int n);
-static void testConvolution(int n);
-static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
-static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
-static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
-static vector<complex<double> > randomComplexes(int n);
-
-// Mutable global variable
-static double maxLogError = -INFINITY;
-
-// Random number generation
-std::default_random_engine randGen((std::random_device())());
-
-
-/*---- Main and test functions ----*/
-
-int main() {
-	// Test power-of-2 size FFTs
-	for (int i = 0; i <= 12; i++)
-		testFft(1 << i);
-	
-	// Test small size FFTs
-	for (int i = 0; i < 30; i++)
-		testFft(i);
-	
-	// Test diverse size FFTs
-	for (int i = 0, prev = 0; i <= 100; i++) {
-		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
-		if (n > prev) {
-			testFft(n);
-			prev = n;
-		}
-	}
-	
-	// Test power-of-2 size convolutions
-	for (int i = 0; i <= 12; i++)
-		testConvolution(1 << i);
-	
-	// Test diverse size convolutions
-	for (int i = 0, prev = 0; i <= 100; i++) {
-		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
-		if (n > prev) {
-			testConvolution(n);
-			prev = n;
-		}
-	}
-	
-	cout << endl;
-	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
-	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
-	return EXIT_SUCCESS;
-}
-
-
-static void testFft(int n) {
-	const vector<complex<double> > input = randomComplexes(n);
-	const vector<complex<double> > expect = naiveDft(input, false);
-	vector<complex<double> > actual = input;
-	Fft::transform(actual, false);
-	double err = log10RmsErr(expect, actual);
-	
-	for (auto it = actual.begin(); it != actual.end(); ++it)
-		*it /= n;
-	Fft::transform(actual, true);
-	err = std::max(log10RmsErr(input, actual), err);
-	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
-	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
-	     << err << endl;
-}
-
-
-static void testConvolution(int n) {
-	const vector<complex<double> > input0 = randomComplexes(n);
-	const vector<complex<double> > input1 = randomComplexes(n);
-	const vector<complex<double> > expect = naiveConvolve(input0, input1);
-	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
-	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
-	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
-	     << log10RmsErr(expect, actual) << endl;
-}
-
-
-/*---- Naive reference computation functions ----*/
-
-static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
-	int n = static_cast<int>(input.size());
-	vector<complex<double> > output;
-	double coef = (inverse ? 2 : -2) * M_PI / n;
-	for (int k = 0; k < n; k++) {  // For each output element
-		complex<double> sum(0);
-		for (int t = 0; t < n; t++) {  // For each input element
-			double angle = coef * (static_cast<long long>(t) * k % n);
-			sum += input[t] * std::polar(1.0, angle);
-		}
-		output.push_back(sum);
-	}
-	return output;
-}
-
-
-static vector<complex<double> > naiveConvolve(
-		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
-	int n = static_cast<int>(xvec.size());
-	vector<complex<double> > result(n);  // All zeros
-	for (int i = 0; i < n; i++) {
-		for (int j = 0; j < n; j++) {
-			int k = (i + j) % n;
-			result[k] += xvec[i] * yvec[j];
-		}
-	}
-	return result;
-}
-
-
-/*---- Utility functions ----*/
-
-static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
-	int n = static_cast<int>(xvec.size());
-	double err = std::pow(10, -99 * 2);
-	for (int i = 0; i < n; i++)
-		err += std::norm(xvec.at(i) - yvec.at(i));
-	err /= n > 0 ? n : 1;
-	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
-	err = std::log10(err);
-	maxLogError = std::max(err, maxLogError);
-	return err;
-}
-
-
-static vector<complex<double> > randomComplexes(int n) {
-	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
-	vector<complex<double> > result;
-	for (int i = 0; i < n; i++)
-		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
-	return result;
-}
diff --git a/src/ref/FftRealPair.cpp b/src/ref/FftRealPair.cpp
deleted file mode 100644
index 9bb414f..0000000
--- a/src/ref/FftRealPair.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include "FftRealPair.hpp"
-
-using std::size_t;
-using std::uintmax_t;
-using std::vector;
-
-
-// Private function prototypes
-static size_t reverseBits(size_t val, int width);
-
-
-void Fft::transform(vector<double> &real, vector<double> &imag) {
-	size_t n = real.size();
-	if (n != imag.size())
-		throw std::invalid_argument("Mismatched lengths");
-	if (n == 0)
-		return;
-	else if ((n & (n - 1)) == 0)  // Is power of 2
-		transformRadix2(real, imag);
-	else  // More complicated algorithm for arbitrary sizes
-		transformBluestein(real, imag);
-}
-
-
-void Fft::inverseTransform(vector<double> &real, vector<double> &imag) {
-	transform(imag, real);
-}
-
-
-void Fft::transformRadix2(vector<double> &real, vector<double> &imag) {
-	// Length variables
-	size_t n = real.size();
-	if (n != imag.size())
-		throw std::invalid_argument("Mismatched lengths");
-	int levels = 0;  // Compute levels = floor(log2(n))
-	for (size_t temp = n; temp > 1U; temp >>= 1)
-		levels++;
-	if (static_cast<size_t>(1U) << levels != n)
-		throw std::domain_error("Length is not a power of 2");
-	
-	// Trigonometric tables
-	vector<double> cosTable(n / 2);
-	vector<double> sinTable(n / 2);
-	for (size_t i = 0; i < n / 2; i++) {
-		cosTable[i] = std::cos(2 * M_PI * i / n);
-		sinTable[i] = std::sin(2 * M_PI * i / n);
-	}
-	
-	// Bit-reversed addressing permutation
-	for (size_t i = 0; i < n; i++) {
-		size_t j = reverseBits(i, levels);
-		if (j > i) {
-			std::swap(real[i], real[j]);
-			std::swap(imag[i], imag[j]);
-		}
-	}
-	
-	// Cooley-Tukey decimation-in-time radix-2 FFT
-	for (size_t size = 2; size <= n; size *= 2) {
-		size_t halfsize = size / 2;
-		size_t tablestep = n / size;
-		for (size_t i = 0; i < n; i += size) {
-			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
-				size_t l = j + halfsize;
-				double tpre =  real[l] * cosTable[k] + imag[l] * sinTable[k];
-				double tpim = -real[l] * sinTable[k] + imag[l] * cosTable[k];
-				real[l] = real[j] - tpre;
-				imag[l] = imag[j] - tpim;
-				real[j] += tpre;
-				imag[j] += tpim;
-			}
-		}
-		if (size == n)  // Prevent overflow in 'size *= 2'
-			break;
-	}
-}
-
-
-void Fft::transformBluestein(vector<double> &real, vector<double> &imag) {
-	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
-	size_t n = real.size();
-	if (n != imag.size())
-		throw std::invalid_argument("Mismatched lengths");
-	size_t m = 1;
-	while (m / 2 <= n) {
-		if (m > SIZE_MAX / 2)
-			throw std::length_error("Vector too large");
-		m *= 2;
-	}
-	
-	// Trigonometric tables
-	vector<double> cosTable(n), sinTable(n);
-	for (size_t i = 0; i < n; i++) {
-		uintmax_t temp = static_cast<uintmax_t>(i) * i;
-		temp %= static_cast<uintmax_t>(n) * 2;
-		double angle = M_PI * temp / n;
-		cosTable[i] = std::cos(angle);
-		sinTable[i] = std::sin(angle);
-	}
-	
-	// Temporary vectors and preprocessing
-	vector<double> areal(m), aimag(m);
-	for (size_t i = 0; i < n; i++) {
-		areal[i] =  real[i] * cosTable[i] + imag[i] * sinTable[i];
-		aimag[i] = -real[i] * sinTable[i] + imag[i] * cosTable[i];
-	}
-	vector<double> breal(m), bimag(m);
-	breal[0] = cosTable[0];
-	bimag[0] = sinTable[0];
-	for (size_t i = 1; i < n; i++) {
-		breal[i] = breal[m - i] = cosTable[i];
-		bimag[i] = bimag[m - i] = sinTable[i];
-	}
-	
-	// Convolution
-	std::pair<vector<double>, vector<double> > cvec = convolve(areal, aimag, breal, bimag);
-	vector<double> creal = std::move(cvec.first );
-	vector<double> cimag = std::move(cvec.second);
-	
-	// Postprocessing
-	for (size_t i = 0; i < n; i++) {
-		real[i] =  creal[i] * cosTable[i] + cimag[i] * sinTable[i];
-		imag[i] = -creal[i] * sinTable[i] + cimag[i] * cosTable[i];
-	}
-}
-
-
-vector<double> Fft::convolve(vector<double> xvec, vector<double> yvec) {
-	size_t n = xvec.size();
-	if (n != yvec.size())
-		throw std::invalid_argument("Mismatched lengths");
-	return convolve(std::move(xvec), vector<double>(n), std::move(yvec), vector<double>(n)).first;
-}
-
-
-std::pair<vector<double>, vector<double> > Fft::convolve(
-		vector<double> xreal, vector<double> ximag,
-		vector<double> yreal, vector<double> yimag) {
-	
-	size_t n = xreal.size();
-	if (n != ximag.size() || n != yreal.size() || n != yimag.size())
-		throw std::invalid_argument("Mismatched lengths");
-	
-	transform(xreal, ximag);
-	transform(yreal, yimag);
-	
-	for (size_t i = 0; i < n; i++) {
-		double temp = xreal[i] * yreal[i] - ximag[i] * yimag[i];
-		ximag[i] = ximag[i] * yreal[i] + xreal[i] * yimag[i];
-		xreal[i] = temp;
-	}
-	inverseTransform(xreal, ximag);
-	
-	for (size_t i = 0; i < n; i++) {  // Scaling (because this FFT implementation omits it)
-		xreal[i] /= n;
-		ximag[i] /= n;
-	}
-	return std::pair<vector<double>, vector<double> >(std::move(xreal), std::move(ximag));
-}
-
-
-static size_t reverseBits(size_t val, int width) {
-	size_t result = 0;
-	for (int i = 0; i < width; i++, val >>= 1)
-		result = (result << 1) | (val & 1U);
-	return result;
-}
-
diff --git a/src/ref/FftRealPair.hpp b/src/ref/FftRealPair.hpp
deleted file mode 100644
index 95db10a..0000000
--- a/src/ref/FftRealPair.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* 
- * Free FFT and convolution (C++)
- * 
- * Copyright (c) 2021 Project Nayuki. (MIT License)
- * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
- * the Software, and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- * - The above copyright notice and this permission notice shall be included in
- *   all copies or substantial portions of the Software.
- * - The Software is provided "as is", without warranty of any kind, express or
- *   implied, including but not limited to the warranties of merchantability,
- *   fitness for a particular purpose and noninfringement. In no event shall the
- *   authors or copyright holders be liable for any claim, damages or other
- *   liability, whether in an action of contract, tort or otherwise, arising from,
- *   out of or in connection with the Software or the use or other dealings in the
- *   Software.
- */
-
-#pragma once
-
-#include <utility>
-#include <vector>
-
-
-namespace Fft {
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This is a wrapper function.
-	 */
-	void transform(std::vector<double> &real, std::vector<double> &imag);
-	
-	
-	/* 
-	 * Computes the inverse discrete Fourier transform (IDFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This is a wrapper function. This transform does not perform scaling, so the inverse is not a true inverse.
-	 */
-	void inverseTransform(std::vector<double> &real, std::vector<double> &imag);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
-	 */
-	void transformRadix2(std::vector<double> &real, std::vector<double> &imag);
-	
-	
-	/* 
-	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
-	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
-	 * Uses Bluestein's chirp z-transform algorithm.
-	 */
-	void transformBluestein(std::vector<double> &real, std::vector<double> &imag);
-	
-	
-	/* 
-	 * Computes the circular convolution of the given real vectors. Each vector's length must be the same.
-	 */
-	std::vector<double> convolve(std::vector<double> xvec, std::vector<double> yvec);
-	
-	
-	/* 
-	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
-	 */
-	std::pair<std::vector<double>, std::vector<double> > convolve(
-		std::vector<double> xreal, std::vector<double> ximag,
-		std::vector<double> yreal, std::vector<double> yimag);
-	
-}
-
diff --git a/src/ref/README.org b/src/ref/README.org
deleted file mode 100644
index 74fd6b9..0000000
--- a/src/ref/README.org
+++ /dev/null
@@ -1,5 +0,0 @@
-* Reference Implementation
-
-This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
-
-For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]

From 4f6d79b0f2aac96398a5bd04727f9cf0d0ae60ab Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Thu, 12 Dec 2024 17:27:41 +0000
Subject: [PATCH 12/16] fix: refactor

---
 src/complex_array.hpp | 251 ++++++++++++++++++++++++------------------
 src/fft.cpp           |  35 +++---
 src/fft.hpp           |   2 +-
 3 files changed, 157 insertions(+), 131 deletions(-)

diff --git a/src/complex_array.hpp b/src/complex_array.hpp
index 8d0762c..0bdb5ac 100644
--- a/src/complex_array.hpp
+++ b/src/complex_array.hpp
@@ -7,142 +7,175 @@
 using Complex = std::complex<double>;
 
 class ComplexProxy {
+    double *real_ptr;
+    double *imag_ptr;
 
-  double *real_ptr;
-  double *imag_ptr;
-
-public:
-
-  double &real() { return *real_ptr; }
-  double &imag() { return *imag_ptr; }
-  
-  ComplexProxy(double *real_ptr, double *imag_ptr)
-      : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
-
-  ComplexProxy(ComplexProxy &&rhs) noexcept
-      : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
-    rhs.real_ptr = nullptr;
-    rhs.imag_ptr = nullptr;
-  }
-  
-  operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
-
-  
-  ComplexProxy &operator=(const Complex &c) {
-    *real_ptr = c.real();
-    *imag_ptr = c.imag();
-    return *this;
-  }
-};
+   public:
+    double &real() { return *real_ptr; }
+    double &imag() { return *imag_ptr; }
 
-class ComplexArray {
-private:
-  double *real;
-  double *imag;
-  size_t array_size;
-
-public:
-  ComplexArray(size_t n) : array_size(n) {
-    if (n == 0) {
-      real = nullptr;
-      imag = nullptr;
-    } else {
-      real = new double[n];
-      std::fill(real, real + n, 0.0);
-      imag = new double[n];
-      std::fill(imag, imag + n, 0.0);
-    }
-  }
+    ComplexProxy(double *real_ptr, double *imag_ptr)
+        : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
 
-  ~ComplexArray() {
-    if (real != nullptr) {
-      delete[] real;
+    ComplexProxy(ComplexProxy &&rhs) noexcept
+        : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
+        rhs.real_ptr = nullptr;
+        rhs.imag_ptr = nullptr;
     }
-    if (imag != nullptr) {
-      delete[] imag;
+
+    operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
+
+    ComplexProxy &operator=(const Complex &c) {
+        *real_ptr = c.real();
+        *imag_ptr = c.imag();
+        return *this;
     }
-  }
-
-  ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
-    real = new double[array_size];
-    imag = new double[array_size];
-    std::copy(other.real, other.real + array_size, real);
-    std::copy(other.imag, other.imag + array_size, imag);
-  }
-
-  ComplexArray(ComplexArray &&rhs) noexcept
-      : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
-    rhs.real = nullptr;
-    rhs.imag = nullptr;
-    rhs.array_size = 0;
-  }
-
-  ComplexArray &operator=(const ComplexArray &other) {
-    if (this == &other) {
-      return *this;
+};
+
+class ComplexArray {
+   private:
+    double *real;
+    double *imag;
+    size_t array_size;
+
+   public:
+    ComplexArray(size_t n) : array_size(n) {
+        if (n == 0) {
+            real = nullptr;
+            imag = nullptr;
+        } else {
+            real = new double[n];
+            std::fill(real, real + n, 0.0);
+            imag = new double[n];
+            std::fill(imag, imag + n, 0.0);
+        }
     }
-    if (real != nullptr) {
-      delete[] real;
+
+    ~ComplexArray() {
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
     }
-    if (imag != nullptr) {
-      delete[] imag;
+
+    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+        real = new double[array_size];
+        imag = new double[array_size];
+        std::copy(other.real, other.real + array_size, real);
+        std::copy(other.imag, other.imag + array_size, imag);
     }
 
-    array_size = other.array_size;
-    real = new double[array_size];
-    imag = new double[array_size];
+    ComplexArray(ComplexArray &&rhs) noexcept
+        : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
+        rhs.real = nullptr;
+        rhs.imag = nullptr;
+        rhs.array_size = 0;
+    }
 
-    for (size_t i = 0; i < array_size; i++) {
-      real[i] = other.real[i];
-      imag[i] = other.imag[i];
+    ComplexArray &operator=(const ComplexArray &other) {
+        if (this == &other) {
+            return *this;
+        }
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+
+        array_size = other.array_size;
+        real = new double[array_size];
+        imag = new double[array_size];
+
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] = other.real[i];
+            imag[i] = other.imag[i];
+        }
+
+        return *this;
     }
 
-    return *this;
-  }
+    void multiply(const ComplexArray &other) {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        for (size_t i = 0; i < array_size; i++) {
+            double temp_real =
+                real[i] * other.real[i] - imag[i] * other.imag[i];
+            double temp_imag =
+                real[i] * other.imag[i] + imag[i] * other.real[i];
+            real[i] = temp_real;
+            imag[i] = temp_imag;
+        }
+    }
 
-  ComplexArray operator*(const ComplexArray &other) const {
-    if (array_size != other.array_size) {
-      throw std::invalid_argument("Array sizes do not match");
+    // void partial_multiply(const ComplexArray &other, size_t start, size_t length) {
+        
+    //     size_t end = start + length;
+    //     if (end > array_size || end > other.array_size) {
+    //         throw std::invalid_argument("Array sizes do not match");
+    //     }
+    //     for (size_t i = 0; i < length; i++) {
+    //         double temp_real =
+    //             real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
+    //         double temp_imag =
+    //             real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
+    //         real[start + i] = temp_real;
+    //         imag[start + i] = temp_imag;
+    //     }
+    // }
+
+    void scale(double scalar) {
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] *= scalar;
+            imag[i] *= scalar;
+        }
     }
 
-    ComplexArray result(array_size);
-    for (size_t i = 0; i < array_size; i++) {
-      result.real[i] = real[i] * other.real[i] - imag[i] * other.imag[i];
-      result.imag[i] = real[i] * other.imag[i] + imag[i] * other.real[i];
+    ComplexArray operator*(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+        ComplexArray result = *this;  // Copy constructor
+        result.multiply(other);
+        return result;
     }
-    return result;
-  }
 
-  ComplexArray operator+(const ComplexArray &other) const {
-    if (array_size != other.array_size) {
-      throw std::invalid_argument("Array sizes do not match");
+    ComplexArray operator+(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        ComplexArray result(array_size);
+        for (size_t i = 0; i < array_size; i++) {
+            result.real[i] = real[i] + other.real[i];
+            result.imag[i] = imag[i] + other.imag[i];
+        }
+        return result;
     }
 
-    ComplexArray result(array_size);
-    for (size_t i = 0; i < array_size; i++) {
-      result.real[i] = real[i] + other.real[i];
-      result.imag[i] = imag[i] + other.imag[i];
+    Complex operator[](size_t index) const {
+        return Complex(real[index], imag[index]);
     }
-    return result;
-  }
 
-  Complex operator[](size_t index) const {
-    return Complex(real[index], imag[index]);
-  }
+    ComplexProxy operator[](size_t index) {
+        return ComplexProxy(real + index, imag + index);
+    }
 
-  ComplexProxy operator[](size_t index) {
-    return ComplexProxy(real + index, imag + index);
-  }
+    
 
-  size_t size() const { return array_size; }
+    size_t size() const { return array_size; }
 
-  double *get_real_ptr() { return real; }
+    double *get_real_ptr() { return real; }
 
-  const double *get_real_ptr() const { return real; }
+    const double *get_real_ptr() const { return real; }
 
-  double *get_imag_ptr() { return imag; }
+    double *get_imag_ptr() { return imag; }
 
-  const double *get_imag_ptr() const { return imag; }
+    const double *get_imag_ptr() const { return imag; }
 };
 
 #endif
diff --git a/src/fft.cpp b/src/fft.cpp
index 692d3f3..08f0525 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -86,13 +86,14 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
   double *real_avec = a.get_real_ptr();
   double *imag_avec = a.get_imag_ptr();
   
+  
+  // TODO: Use SIMD
   for (size_t i = 0; i < n; i++) {
     real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i];
     imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i];
+    // a[i] = Complex(x[i]) * Complex(expTable[i]);
   }
   
-  // a = x * expTable;
-  
   double *real_bvec = b.get_real_ptr();
   double *imag_bvec = b.get_imag_ptr();
   real_bvec[0] = real_expTable[0];
@@ -104,36 +105,28 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
   }
   
   ComplexArray c = convolove(a, b);
-
+  // x = c * expTable;
   for (size_t i = 0; i < n; i++) {
     real[i] = c.get_real_ptr()[i] * real_expTable[i] - c.get_imag_ptr()[i] * imag_expTable[i];
     imag[i] = c.get_real_ptr()[i] * imag_expTable[i] + c.get_imag_ptr()[i] * real_expTable[i];
   }
 }
 
-ComplexArray fft::convolove(ComplexArray x, ComplexArray y) {
-  size_t n = x.size();
-  if (n != y.size())
+ComplexArray fft::convolove(const ComplexArray &_x, const ComplexArray &_y) {
+  size_t n = _x.size();
+  if (n != _y.size())
     throw std::domain_error("Mismatched lengths");
+  
+  ComplexArray x = _x;
+  ComplexArray y = _y;
+
   transform(x, false);
   transform(y, false);
 
-  double *xr = x.get_real_ptr();
-  double *yr = y.get_real_ptr();
-  double *xi = x.get_imag_ptr();
-  double *yi = y.get_imag_ptr();
-  
-  for (size_t i = 0; i < n; ++i) {
-    double xrn = xr[i] * yr[i] - xi[i] * yi[i];
-    double xin = xr[i] * yi[i] + xi[i] * yr[i];
-    xr[i] = xrn;
-    xi[i] = xin;
-  }
+  x.multiply(y);
   transform(x, true);
-  for (size_t i = 0; i < n; ++i) {
-    xr[i] /= static_cast<double>(n);
-    xi[i] /= static_cast<double>(n);
-  }
+  x.scale(1.0 / n);
+  
   return x;
 }
 
diff --git a/src/fft.hpp b/src/fft.hpp
index c4620fb..768a7fe 100644
--- a/src/fft.hpp
+++ b/src/fft.hpp
@@ -10,6 +10,6 @@ void transformBluestein(ComplexArray &x, bool inverse);
 void transform(ComplexArray &x, bool inverse);
 
 // ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
-ComplexArray convolove(ComplexArray x, ComplexArray y);
+ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
 } // namespace fft
 

From 7299b11bfa78d2e868acafb45ac5c20382d434ae Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Sat, 14 Dec 2024 09:13:10 +0000
Subject: [PATCH 13/16] fix: refractor and prepare for SIMD

---
 src/complex_array.cpp | 181 ++++++++++++++++++++++++++++++++++++++++++
 src/complex_array.hpp |   4 +-
 src/dct_naive.hpp     |  17 ++--
 src/fdct.hpp          | 164 ++++++++++++++++++++------------------
 src/fdct_inner.cpp    | 108 +++++--------------------
 src/fdct_inner.hpp    |   4 +-
 src/fft.cpp           |  52 +++++++-----
 src/fft.hpp           |  10 +--
 test_dct.py           |   7 +-
 9 files changed, 337 insertions(+), 210 deletions(-)
 create mode 100644 src/complex_array.cpp

diff --git a/src/complex_array.cpp b/src/complex_array.cpp
new file mode 100644
index 0000000..0bdb5ac
--- /dev/null
+++ b/src/complex_array.cpp
@@ -0,0 +1,181 @@
+#ifndef COMPLEX_ARRAY_HPP
+#define COMPLEX_ARRAY_HPP
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+using Complex = std::complex<double>;
+
+class ComplexProxy {
+    double *real_ptr;
+    double *imag_ptr;
+
+   public:
+    double &real() { return *real_ptr; }
+    double &imag() { return *imag_ptr; }
+
+    ComplexProxy(double *real_ptr, double *imag_ptr)
+        : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
+
+    ComplexProxy(ComplexProxy &&rhs) noexcept
+        : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
+        rhs.real_ptr = nullptr;
+        rhs.imag_ptr = nullptr;
+    }
+
+    operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
+
+    ComplexProxy &operator=(const Complex &c) {
+        *real_ptr = c.real();
+        *imag_ptr = c.imag();
+        return *this;
+    }
+};
+
+class ComplexArray {
+   private:
+    double *real;
+    double *imag;
+    size_t array_size;
+
+   public:
+    ComplexArray(size_t n) : array_size(n) {
+        if (n == 0) {
+            real = nullptr;
+            imag = nullptr;
+        } else {
+            real = new double[n];
+            std::fill(real, real + n, 0.0);
+            imag = new double[n];
+            std::fill(imag, imag + n, 0.0);
+        }
+    }
+
+    ~ComplexArray() {
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+    }
+
+    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+        real = new double[array_size];
+        imag = new double[array_size];
+        std::copy(other.real, other.real + array_size, real);
+        std::copy(other.imag, other.imag + array_size, imag);
+    }
+
+    ComplexArray(ComplexArray &&rhs) noexcept
+        : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
+        rhs.real = nullptr;
+        rhs.imag = nullptr;
+        rhs.array_size = 0;
+    }
+
+    ComplexArray &operator=(const ComplexArray &other) {
+        if (this == &other) {
+            return *this;
+        }
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+
+        array_size = other.array_size;
+        real = new double[array_size];
+        imag = new double[array_size];
+
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] = other.real[i];
+            imag[i] = other.imag[i];
+        }
+
+        return *this;
+    }
+
+    void multiply(const ComplexArray &other) {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        for (size_t i = 0; i < array_size; i++) {
+            double temp_real =
+                real[i] * other.real[i] - imag[i] * other.imag[i];
+            double temp_imag =
+                real[i] * other.imag[i] + imag[i] * other.real[i];
+            real[i] = temp_real;
+            imag[i] = temp_imag;
+        }
+    }
+
+    // void partial_multiply(const ComplexArray &other, size_t start, size_t length) {
+        
+    //     size_t end = start + length;
+    //     if (end > array_size || end > other.array_size) {
+    //         throw std::invalid_argument("Array sizes do not match");
+    //     }
+    //     for (size_t i = 0; i < length; i++) {
+    //         double temp_real =
+    //             real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
+    //         double temp_imag =
+    //             real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
+    //         real[start + i] = temp_real;
+    //         imag[start + i] = temp_imag;
+    //     }
+    // }
+
+    void scale(double scalar) {
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] *= scalar;
+            imag[i] *= scalar;
+        }
+    }
+
+    ComplexArray operator*(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+        ComplexArray result = *this;  // Copy constructor
+        result.multiply(other);
+        return result;
+    }
+
+    ComplexArray operator+(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        ComplexArray result(array_size);
+        for (size_t i = 0; i < array_size; i++) {
+            result.real[i] = real[i] + other.real[i];
+            result.imag[i] = imag[i] + other.imag[i];
+        }
+        return result;
+    }
+
+    Complex operator[](size_t index) const {
+        return Complex(real[index], imag[index]);
+    }
+
+    ComplexProxy operator[](size_t index) {
+        return ComplexProxy(real + index, imag + index);
+    }
+
+    
+
+    size_t size() const { return array_size; }
+
+    double *get_real_ptr() { return real; }
+
+    const double *get_real_ptr() const { return real; }
+
+    double *get_imag_ptr() { return imag; }
+
+    const double *get_imag_ptr() const { return imag; }
+};
+
+#endif
diff --git a/src/complex_array.hpp b/src/complex_array.hpp
index 0bdb5ac..911d8fa 100644
--- a/src/complex_array.hpp
+++ b/src/complex_array.hpp
@@ -97,7 +97,7 @@ class ComplexArray {
         return *this;
     }
 
-    void multiply(const ComplexArray &other) {
+    void multiply(const ComplexArray &other, bool simd = false) {
         if (array_size != other.array_size) {
             throw std::invalid_argument("Array sizes do not match");
         }
@@ -128,7 +128,7 @@ class ComplexArray {
     //     }
     // }
 
-    void scale(double scalar) {
+    void scale(double scalar, bool simd = false) {
         for (size_t i = 0; i < array_size; i++) {
             real[i] *= scalar;
             imag[i] *= scalar;
diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp
index 7764d41..07e19fe 100644
--- a/src/dct_naive.hpp
+++ b/src/dct_naive.hpp
@@ -12,7 +12,7 @@ namespace py = pybind11;
 constexpr double SQRT2_1 = 0.7071067811865475;
 constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
 
-static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) {
+static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) {
   double sum = 0.0f;
   for (size_t j = 0; j < buf_size; j++) {
     sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
@@ -28,20 +28,20 @@ static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
   return sum;
 }
 
-void dct_naive_inner(double *buf, size_t buf_size, double *out) {
+void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
   for (size_t i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * dct_naive_single(buf, buf_size, i);
+    out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd);
   }
 }
 
-void idct_naive_inner(double *buf, size_t buf_size, double *out) {
+void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
   for (size_t i = 0; i < buf_size; i++) {
     out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size);
   }
 }
 
 py::array dct_naive(
-  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
   py::buffer_info buf_info = in_arr.request();
   double *ptr = static_cast<double *>(buf_info.ptr);
   size_t buf_size = buf_info.size;
@@ -49,13 +49,13 @@ py::array dct_naive(
   py::array_t<double> out_arr(buf_info.shape);
 
   double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  dct_naive_inner(ptr, buf_size, out_buf);
+  dct_naive_inner(ptr, buf_size, out_buf, simd);
 
   return out_arr;
 }
 
 py::array idct_naive(
-  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
   py::buffer_info buf_info = in_arr.request();
   double *ptr = static_cast<double *>(buf_info.ptr);
   size_t buf_size = buf_info.size;
@@ -63,7 +63,8 @@ py::array idct_naive(
   py::array_t<double> out_arr(buf_info.shape);
 
   double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
-  idct_naive_inner(ptr, buf_size, out_buf);
+  idct_naive_inner(ptr, buf_size, out_buf, simd);
 
   return out_arr;
 }
+
diff --git a/src/fdct.hpp b/src/fdct.hpp
index ee08687..476d7fc 100644
--- a/src/fdct.hpp
+++ b/src/fdct.hpp
@@ -1,105 +1,111 @@
 #pragma once
 
-#include <cassert>
 #include <fftw3.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
-#include <iostream>
+#include <cassert>
 #include <cmath>
 #include <complex>
 #include <cstddef>
+#include <iostream>
 
 // #include "FftComplex.hpp"
 #include "complex_array.hpp"
-#include "fft.hpp"
 #include "fdct_inner.hpp"
-
+#include "fft.hpp"
 
 namespace py = pybind11;
 #define _USE_MATH_DEFINES
 
 // Wrapper function to accept and return NumPy arrays
-py::array
-fdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
-  // Request a buffer from the input array
-  py::buffer_info buf = in_arr.request();
-  if (buf.ndim != 1) {
-    throw std::runtime_error("Input array must be 1-dimensional.");
-  }
-
-  // Convert input NumPy array to std::vector
-  size_t N = buf.shape[0];
-  ComplexArray input_data(N);
-  std::copy(static_cast<double *>(buf.ptr), static_cast<double *>(buf.ptr) + N,
-            input_data.get_real_ptr());
-  std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
-  // Perform FCT
-  std::vector<double> result = fct_inner(input_data);
-
-  // Convert result back to NumPy array
-  return py::array_t<double>(result.size(), result.data());
+py::array fdct(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    // Request a buffer from the input array
+    py::buffer_info buf = in_arr.request();
+    // Check if the input array is 1-dimensional or 2-dimensional
+
+    if (buf.ndim > 2) {
+        throw std::runtime_error(
+            "Input array must be 1-dimensional or 2-dimensional.");
+    }
+
+    // Convert input NumPy array to std::vector
+    size_t N = buf.shape[0];
+    ComplexArray input_data(N);
+    std::copy(static_cast<double *>(buf.ptr),
+              static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
+    std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
+    // Perform FCT
+    double *result = new double[N];
+    fct_inner(input_data, result, simd);
+
+    // Convert result back to NumPy array
+    return py::array_t<double>(N, result);
 }
 
 std::vector<double> idct_inner(const std::vector<double> &c) {
-  size_t N = c.size();
-  std::vector<double> x(N);
-  // auto u = reorderOutput(c);
-  ComplexArray u(N);
-  // Allocate FFTW input/output arrays
-  fftw_complex *fft_in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
-  fftw_complex *fft_out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
-
-  // Initialize FFTW plan
-  fftw_plan plan =
-      fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);
-
-  // Fill FFTW input array
-  for (size_t n = 0; n < N; ++n) {
-    fft_in[n][0] = c[n]; // Real part
-    fft_in[n][1] = 0.0;  // Imaginary part
-  }
-
-  // Execute IFFT
-  fftw_execute(plan);
-
-  // Compute the IDCT using the IFFT output
-  for (size_t k = 0; k < N; ++k) {
-    double real = fft_out[k][0];
-    double imag = fft_out[k][1];
-    x[k] =
-        (real * cos(M_PI * k / (2.0 * N)) + imag * sin(M_PI * k / (2.0 * N))) /
-        N;
-  }
-
-  // Free FFTW resources
-  fftw_destroy_plan(plan);
-  fftw_free(fft_in);
-  fftw_free(fft_out);
-
-  return x;
+    size_t N = c.size();
+    std::vector<double> x(N);
+    // auto u = reorderOutput(c);
+    ComplexArray u(N);
+    // Allocate FFTW input/output arrays
+    fftw_complex *fft_in =
+        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+    fftw_complex *fft_out =
+        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+
+    // Initialize FFTW plan
+    fftw_plan plan =
+        fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+    // Fill FFTW input array
+    for (size_t n = 0; n < N; ++n) {
+        fft_in[n][0] = c[n];  // Real part
+        fft_in[n][1] = 0.0;   // Imaginary part
+    }
+
+    // Execute IFFT
+    fftw_execute(plan);
+
+    // Compute the IDCT using the IFFT output
+    for (size_t k = 0; k < N; ++k) {
+        double real = fft_out[k][0];
+        double imag = fft_out[k][1];
+        x[k] = (real * cos(M_PI * k / (2.0 * N)) +
+                imag * sin(M_PI * k / (2.0 * N))) /
+               N;
+    }
+
+    // Free FFTW resources
+    fftw_destroy_plan(plan);
+    fftw_free(fft_in);
+    fftw_free(fft_out);
+
+    return x;
 }
 
 // Wrapper function to accept and return NumPy arrays
-py::array
-ifdct(py::array_t<double, py::array::c_style | py::array::forcecast> in_arr) {
-  // Request a buffer from the input array
-  py::buffer_info buf = in_arr.request();
-  if (buf.ndim != 1) {
-    throw std::runtime_error("Input array must be 1-dimensional.");
-  }
-
-  // Convert input NumPy array to std::vector
-  size_t N = buf.shape[0];
-  // std::vector<double> input_data(static_cast<double *>(buf.ptr),
-  //                               static_cast<double *>(buf.ptr) + N);
-  ComplexArray input_data(N);
-  std::copy(static_cast<double *>(buf.ptr), static_cast<double *>(buf.ptr) + N,
-	    input_data.get_real_ptr());
-  
-  // Perform IDCT
-  std::vector<double> result = ifdct_inner(input_data);
-  // Convert result back to NumPy array
-  return py::array_t<double>(result.size(), result.data());
+py::array ifdct(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    // Request a buffer from the input array
+    py::buffer_info buf = in_arr.request();
+    if (buf.ndim != 1) {
+        throw std::runtime_error("Input array must be 1-dimensional.");
+    }
+
+    // Convert input NumPy array to std::vector
+    size_t N = buf.shape[0];
+    // std::vector<double> input_data(static_cast<double *>(buf.ptr),
+    //                               static_cast<double *>(buf.ptr) + N);
+    ComplexArray input_data(N);
+    std::copy(static_cast<double *>(buf.ptr),
+              static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
+
+    // Perform IDCT
+    double *result = new double[N];
+    ifdct_inner(input_data, result, simd);
+    // Convert result back to NumPy array
+    return py::array_t<double>(N, result);
 }
diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp
index bc609dd..4b717dd 100644
--- a/src/fdct_inner.cpp
+++ b/src/fdct_inner.cpp
@@ -1,7 +1,4 @@
 
-#pragma once
-#include "fdct_inner.hpp"
-
 #include <fftw3.h>
 
 #include <cmath>
@@ -11,6 +8,7 @@
 #include <vector>
 
 #include "complex_array.hpp"
+#include "fdct_inner.hpp"
 #include "fft.hpp"
 
 // Function to compute the Fast Cosine Transform
@@ -61,116 +59,48 @@ ComplexArray reorderOutput(const ComplexArray &v) {
     return x;
 }
 
-ComplexArray idctPreprocess(const ComplexArray &x) {
+void idctPreprocess(ComplexArray &x) {
     size_t N = x.size();
-    ComplexArray u(N);
-    ComplexArray v(N);
-    v = x;
-    v.get_real_ptr()[0] /= 2;
-    // for (size_t n = 0; n < N; ++n) {
-    //     // Compute the complex term
-    //     Complex W = std::polar(1.0, -M_PI * n / (2 * N));
-
-    //     // Compute u[n]
-    //     u[n] = 0.5 * (x[n] - x[N - n - 1] * Complex(0, 1)) * W;
-    // }
-    
+    auto real = x.get_real_ptr();
+    auto imag = x.get_imag_ptr();
+
+    real[0] /= 2;
     for (size_t n = 0; n < N; ++n) {
         double tmp = n * M_PI / (2 * N);
-        u.get_real_ptr()[n] = v[n].real() * cos(tmp);
-        u.get_imag_ptr()[n] = -v[n].real() * sin(tmp);
+        double tmp_real = real[n];
+        real[n] = tmp_real * cos(tmp);
+        imag[n] = -tmp_real * sin(tmp);
     }
-    return u;
 }
 
 
-std::vector<double> fct_inner(ComplexArray &x) {
+void fct_inner(const ComplexArray &x, double *result, bool simd) {
     size_t N = x.size();
-    std::vector<double> dct(N);
-    auto u = reorderInput(x);
-    ComplexArray fft_in(N);
-    std::vector<std::complex<double> > vc;
-    // Fill FFT input array
-    for (size_t n = 0; n < N; ++n) {
-        vc.push_back(
-            std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
-        fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
-        fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
-    }
-
-    // use fftw3
-    // fftw_complex *in, *out;
-    // fftw_plan p;
-    // in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
-    // out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
-    // for (size_t i = 0; i < N; i++) {
-    //   in[i][0] = fft_in.get_real_ptr()[i];
-    //   in[i][1] = fft_in.get_imag_ptr()[i];
-    // }
-    // p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
-    // fftw_execute(p);
-    // for (size_t i = 0; i < N; i++) {
-    //   fft_in.get_real_ptr()[i] = out[i][0];
-    //   fft_in.get_imag_ptr()[i] = out[i][1];
-    // }
-    // fftw_destroy_plan(p);
-    // fftw_free(in);
-    // fftw_free(out);
+    auto fft_in = reorderInput(x);
 
     // Perform FFT
     fft::transform(fft_in, false);
-    // std::vector<std::complex<double> > fft_in_p;
-    // for (size_t n = 0; n < N; ++n) {
-    //   fft_in_p.push_back(std::complex<double>(fft_in.get_real_ptr()[n],
-    //   fft_in.get_imag_ptr()[n]));
-    // }
-    // Fft::transform(fft_in_p, false);
-    // for (size_t n = 0; n < N; ++n) {
-    //   fft_in.get_real_ptr()[n] = fft_in_p[n].real();
-    //   fft_in.get_imag_ptr()[n] = fft_in_p[n].imag();
-    // }
 
     // Compute the DCT using the FFT output
+    double *x_real = fft_in.get_real_ptr();
+    double *x_imag = fft_in.get_imag_ptr();
     for (size_t k = 0; k < N; ++k) {
-        double real = fft_in[k].real();
-        double imag = fft_in[k].imag();
-        dct[k] =
-            (real * cos(M_PI * k / (2 * N)) - imag * sin(-M_PI * k / (2 * N))) *
+        result[k] =
+            (x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) *
             2;
     }
-    return dct;
 }
 
-std::vector<double> ifdct_inner(ComplexArray &c) {
+void ifdct_inner(const ComplexArray &c, double *result, bool simd) {
     size_t N = c.size();
-    std::vector<double> x(N);
-    // auto u = reorderInput(c);
-    auto u = idctPreprocess(c);
-    ComplexArray fft_in(N);
-    std::vector<std::complex<double> > vc;
-    // Fill FFT input array
-    for (size_t n = 0; n < N; ++n) {
-        vc.push_back(
-            std::complex<double>(u.get_real_ptr()[n], u.get_imag_ptr()[n]));
-        fft_in.get_real_ptr()[n] = u.get_real_ptr()[n];
-        fft_in.get_imag_ptr()[n] = u.get_imag_ptr()[n];
-    }
+    ComplexArray fft_in = c;
+    idctPreprocess(fft_in);
 
     // Perform FFT
     fft::transform(fft_in, false);
 
     auto x_ = reorderOutput(fft_in);
     for (size_t n = 0; n < N; ++n) {
-        x[n] = x_.get_real_ptr()[n] / N;
+        result[n] = x_.get_real_ptr()[n] / N;
     }
-    
-    // Compute the IDCT using the FFT output
-    // for (size_t k = 0; k < N; ++k) {
-    //     double real = fft_in[k].real();
-    //     double imag = fft_in[k].imag();
-    //     x[k] = (real * cos(M_PI * k / (2.0 * N)) +
-    //             imag * sin(M_PI * k / (2.0 * N))) /
-    //            N;
-    // }
-    return x;
 }
diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp
index 76e84f6..38dd6a3 100644
--- a/src/fdct_inner.hpp
+++ b/src/fdct_inner.hpp
@@ -12,6 +12,6 @@
 
 ComplexArray reorderInput(const ComplexArray &x);
 
-std::vector<double> fct_inner(ComplexArray &x);
+void fct_inner(const ComplexArray &x, double *result, bool simd);
 
-std::vector<double> ifdct_inner(ComplexArray &c);
+void ifdct_inner(const ComplexArray &c, double *result, bool simd);
diff --git a/src/fft.cpp b/src/fft.cpp
index 08f0525..0e8bfb0 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -12,7 +12,19 @@
 
 static size_t reverseBits(size_t val, int width);
 
-void fft::transformRadix2(ComplexArray &x, bool inverse) {
+static ComplexArray mkExpTable(size_t n, bool inverse) {
+  ComplexArray expTable(n);
+  double *real = expTable.get_real_ptr();
+  double *imag = expTable.get_imag_ptr();
+  for (size_t i = 0; i < n; i++) {
+    double angle = (inverse ? 2 : -2) * M_PI * i / n;
+    real[i] = std::cos(angle);
+    imag[i] = std::sin(angle);
+  }
+  return expTable;
+}
+
+void fft::transformRadix2(ComplexArray &x, bool inverse, bool simd) {
   int levels = 0;
   size_t n = x.size();
   double *real = x.get_real_ptr();
@@ -23,12 +35,8 @@ void fft::transformRadix2(ComplexArray &x, bool inverse) {
   if (static_cast<size_t>(1U) << levels != n)
     throw std::domain_error("Length is not a power of 2");
 
-  ComplexArray expTable(n / 2);
+  ComplexArray expTable = mkExpTable(n, inverse);
 
-  for (size_t i = 0; i < n / 2; i++) {
-    double angle = (inverse ? 2 : -2) * M_PI * i / n;
-    expTable[i] = std::polar(1.0, angle);
-  }
 
   for (size_t i = 0; i < n; i++) {
     size_t j = reverseBits(i, levels);
@@ -61,7 +69,7 @@ void fft::transformRadix2(ComplexArray &x, bool inverse) {
   }
 }
 
-void fft::transformBluestein(ComplexArray &x, bool inverse) {
+void fft::transformBluestein(ComplexArray &x, bool inverse, bool simd) {
   size_t n = x.size();
   double *real = x.get_real_ptr();
   double *imag = x.get_imag_ptr();
@@ -104,40 +112,40 @@ void fft::transformBluestein(ComplexArray &x, bool inverse) {
     imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i];
   }
   
-  ComplexArray c = convolove(a, b);
+  convolove(a, b, simd);
   // x = c * expTable;
   for (size_t i = 0; i < n; i++) {
-    real[i] = c.get_real_ptr()[i] * real_expTable[i] - c.get_imag_ptr()[i] * imag_expTable[i];
-    imag[i] = c.get_real_ptr()[i] * imag_expTable[i] + c.get_imag_ptr()[i] * real_expTable[i];
+    real[i] = a.get_real_ptr()[i] * real_expTable[i] - a.get_imag_ptr()[i] * imag_expTable[i];
+    imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + a.get_imag_ptr()[i] * real_expTable[i];
   }
 }
 
-ComplexArray fft::convolove(const ComplexArray &_x, const ComplexArray &_y) {
+
+// compute the convolution of two complex arrays and set the result to x
+void fft::convolove(ComplexArray &_x, const ComplexArray &_y, bool simd) {
   size_t n = _x.size();
   if (n != _y.size())
     throw std::domain_error("Mismatched lengths");
   
-  ComplexArray x = _x;
+  ComplexArray& x = _x;
   ComplexArray y = _y;
 
-  transform(x, false);
-  transform(y, false);
+  transform(x, false, simd);
+  transform(y, false, simd);
 
-  x.multiply(y);
-  transform(x, true);
-  x.scale(1.0 / n);
-  
-  return x;
+  x.multiply(y, simd);
+  transform(x, true, simd);
+  x.scale(1.0 / n, simd);
 }
 
-void fft::transform(ComplexArray &x, bool inverse) {
+void fft::transform(ComplexArray &x, bool inverse, bool simd) {
   size_t n = x.size();
   if (n == 0)
     return;
   else if ((n & (n - 1)) == 0) // Is power of 2
-    transformRadix2(x, inverse);
+    transformRadix2(x, inverse, simd);
   else // More complicated algorithm for arbitrary sizes
-    transformBluestein(x, inverse);
+    transformBluestein(x, inverse, simd);
 }
 
 static size_t reverseBits(size_t val, int width) {
diff --git a/src/fft.hpp b/src/fft.hpp
index 768a7fe..868eee7 100644
--- a/src/fft.hpp
+++ b/src/fft.hpp
@@ -3,13 +3,11 @@
 #include "complex_array.hpp"
 
 namespace fft {
-void transformRadix2(ComplexArray &x, bool inverse);
+void transformRadix2(ComplexArray &x, bool inverse, bool simd = false);
 
-void transformBluestein(ComplexArray &x, bool inverse);
+void transformBluestein(ComplexArray &x, bool inverse, bool simd = false);
 
-void transform(ComplexArray &x, bool inverse);
+void transform(ComplexArray &x, bool inverse, bool simd = false);
 
-// ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
-ComplexArray convolove(const ComplexArray &x, const ComplexArray &y);
+void convolove(ComplexArray &x, const ComplexArray &y, bool simd = false);
 } // namespace fft
-
diff --git a/test_dct.py b/test_dct.py
index 40a233f..092c6f7 100644
--- a/test_dct.py
+++ b/test_dct.py
@@ -7,6 +7,7 @@
 # Parameters
 test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097, 8192, 8193]
 tolerance = 1e-6  # Tolerance for individual element comparison
+simd = [False, True]  # SIMD flag
 
 # List of test functions and their corresponding references
 test_cases = [
@@ -18,8 +19,10 @@
 
 @pytest.mark.parametrize("size", test_size_seq)
 @pytest.mark.parametrize("name, func, ref_func", test_cases)
-def test_transform(name, func, ref_func, size):
+@pytest.mark.parametrize("simd", simd)
+def test_transform(name, func, ref_func, size, simd):
     inp = np.random.rand(size)
     gt = ref_func(inp)
-    ans = func(inp)
+    ans = func(inp, simd)
     np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
+

From c80074e553034802c8a709b4a437bfd0c8aa7c94 Mon Sep 17 00:00:00 2001
From: cfmc30 <cfmc.cs13@nycu.edu.tw>
Date: Sat, 14 Dec 2024 13:48:42 +0000
Subject: [PATCH 14/16] fix: Use ubuntu 24.04

---
 .github/workflows/cmake-single-platform.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cmake-single-platform.yml b/.github/workflows/cmake-single-platform.yml
index ca27d04..66b348d 100644
--- a/.github/workflows/cmake-single-platform.yml
+++ b/.github/workflows/cmake-single-platform.yml
@@ -17,7 +17,7 @@ jobs:
     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
     # You can convert this to a matrix build if you need cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
 
@@ -29,7 +29,8 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        sudo apt install python3-pybind11 libfftw3-dev
+        sudo apt -y update
+        sudo apt -y install python3-pybind11 libfftw3-dev cmake python3-dev
         python -m pip install --upgrade pip
         python -m pip install pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From 10a359e3e4445ff9fc53149efb73b6b400f0849d Mon Sep 17 00:00:00 2001
From: Colorfulmooncat <cfmc.cs13@nycu.edu.tw>
Date: Sun, 15 Dec 2024 02:52:43 +0800
Subject: [PATCH 15/16] doc: Add nayuki's website

---
 ref/README.org | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ref/README.org b/ref/README.org
index 74fd6b9..2befb38 100644
--- a/ref/README.org
+++ b/ref/README.org
@@ -2,4 +2,6 @@
 
 This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
 
-For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]
+For more details, please visit Nayuki's website
+- https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms

From 569d74b3e5d57361863829ff3c475c5ecb62e218 Mon Sep 17 00:00:00 2001
From: Colorfulmooncat <cfmc.cs13@nycu.edu.tw>
Date: Sun, 15 Dec 2024 02:56:48 +0800
Subject: [PATCH 16/16] doc: fix typo and add reference

---
 README.org | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.org b/README.org
index e1782c1..6ed7575 100644
--- a/README.org
+++ b/README.org
@@ -43,7 +43,7 @@ modern multicore processors.
 :PROPERTIES:
 :CUSTOM_ID: prospective-users
 :END:
-FDCP will be valuable in fieds where DCT is used extensively:
+FDCT will be valuable in fieds where DCT is used extensively:
 
 - Image processing and compression
 - Audio processing and compression
@@ -136,3 +136,7 @@ multithread=True, simd=True)
    [[https://coehuman.uodiyala.edu.iq/uploads/Coehuman%20library%20pdf/%D9%83%D8%AA%D8%A8%20%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D9%8A%D8%A7%D8%AA%20Mathematics%20books/Wavelets/25%20(2).pdf]]
 3. C. W. Kok, Fast algorithm for computing discrete cosine transform
    [[https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=558495]]
+4. Nayuki, Fast discrete cosine transform algorithms
+   [[https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms]]
+5. Nayuki, Free small FFT in multiple languages
+   [[https://www.nayuki.io/page/free-small-fft-in-multiple-languages]]