diff --git a/.github/workflows/cmake-single-platform.yml b/.github/workflows/cmake-single-platform.yml
index 5012e2a..66b348d 100644
--- a/.github/workflows/cmake-single-platform.yml
+++ b/.github/workflows/cmake-single-platform.yml
@@ -17,7 +17,7 @@ jobs:
     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
     # You can convert this to a matrix build if you need cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
 
@@ -29,7 +29,8 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        sudo apt install python3-pybind11
+        sudo apt -y update
+        sudo apt -y install python3-pybind11 libfftw3-dev cmake python3-dev
         python -m pip install --upgrade pip
         python -m pip install pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
diff --git a/.gitignore b/.gitignore
index 9bf3746..374071e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 compile_commands.json
+build/
+.vscode/
+.cache/
+*.so
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 846483e..4afb28a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,16 @@ include_directories(${PYTHON_INCLUDE_DIRS})
 # Find Pybind11
 find_package(pybind11 REQUIRED)
 
-pybind11_add_module(fdct src/fdct.cpp)
+find_package(PkgConfig REQUIRED)
+pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
+include_directories(PkgConfig::FFTW)
+link_libraries(PkgConfig::FFTW)
+
+pybind11_add_module(fdct 
+    src/fdct_module.cpp
+    src/fft.cpp
+    src/fdct_inner.cpp
+)
 
 # Link the module with Python
 target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
diff --git a/ref/FastDctFft.hpp b/ref/FastDctFft.hpp
new file mode 100644
index 0000000..624234d
--- /dev/null
+++ b/ref/FastDctFft.hpp
@@ -0,0 +1,36 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <vector>
+
+
+namespace FastDctFft {
+	
+	void transform(std::vector<double> &vec);
+	
+	void inverseTransform(std::vector<double> &vec);
+	
+}
+
diff --git a/ref/FastDctFtt.cpp b/ref/FastDctFtt.cpp
new file mode 100644
index 0000000..5acfa96
--- /dev/null
+++ b/ref/FastDctFtt.cpp
@@ -0,0 +1,75 @@
+/* 
+ * Fast discrete cosine transform algorithms (C++)
+ * 
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include "FastDctFft.hpp"
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::vector;
+
+
+// DCT type II, unscaled
+void FastDctFft::transform(vector<double> &vec) {
+	size_t len = vec.size();
+	size_t halfLen = len / 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < halfLen; i++) {
+		real.at(i) = vec.at(i * 2);
+		real.at(len - 1 - i) = vec.at(i * 2 + 1);
+	}
+	if (len % 2 == 1)
+		real.at(halfLen) = vec.at(len - 1);
+	std::fill(vec.begin(), vec.end(), 0.0);
+	Fft::transform(real, vec);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		vec.at(i) = real.at(i) * std::cos(temp) + vec.at(i) * std::sin(temp);
+	}
+}
+
+
+// DCT type III, unscaled
+void FastDctFft::inverseTransform(vector<double> &vec) {
+	size_t len = vec.size();
+	if (len > 0)
+		vec.at(0) /= 2;
+	vector<double> real(len);
+	for (size_t i = 0; i < len; i++) {
+		double temp = i * M_PI / (len * 2);
+		real.at(i) = vec.at(i) * std::cos(temp);
+		vec.at(i) *= -std::sin(temp);
+	}
+	Fft::transform(real, vec);
+	
+	size_t halfLen = len / 2;
+	for (size_t i = 0; i < halfLen; i++) {
+		vec.at(i * 2 + 0) = real.at(i);
+		vec.at(i * 2 + 1) = real.at(len - 1 - i);
+	}
+	if (len % 2 == 1)
+		vec.at(len - 1) = real.at(halfLen);
+}
+
diff --git a/ref/FftComplex.cpp b/ref/FftComplex.cpp
new file mode 100644
index 0000000..4f842a0
--- /dev/null
+++ b/ref/FftComplex.cpp
@@ -0,0 +1,149 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <utility>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<complex<double> > &vec, bool inverse) {
+	size_t n = vec.size();
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(vec, inverse);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(vec, inverse);
+}
+
+
+void Fft::transformRadix2(vector<complex<double> > &vec, bool inverse) {
+	// Length variables
+	size_t n = vec.size();
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++)
+		expTable[i] = std::polar(1.0, (inverse ? 2 : -2) * M_PI * i / n);
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i)
+			std::swap(vec[i], vec[j]);
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				complex<double> temp = vec[j + halfsize] * expTable[k];
+				vec[j + halfsize] = vec[j] - temp;
+				vec[j] += temp;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<complex<double> > &vec, bool inverse) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = vec.size();
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric table
+	vector<complex<double> > expTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = (inverse ? M_PI : -M_PI) * temp / n;
+		expTable[i] = std::polar(1.0, angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<complex<double> > avec(m);
+	for (size_t i = 0; i < n; i++)
+		avec[i] = vec[i] * expTable[i];
+	vector<complex<double> > bvec(m);
+	bvec[0] = expTable[0];
+	for (size_t i = 1; i < n; i++)
+		bvec[i] = bvec[m - i] = std::conj(expTable[i]);
+	
+	// Convolution
+	vector<complex<double> > cvec = convolve(std::move(avec), std::move(bvec));
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++)
+		vec[i] = cvec[i] * expTable[i];
+}
+
+
+vector<complex<double> > Fft::convolve(
+		vector<complex<double> > xvec,
+		vector<complex<double> > yvec) {
+	
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::domain_error("Mismatched lengths");
+	transform(xvec, false);
+	transform(yvec, false);
+	for (size_t i = 0; i < n; i++)
+		xvec[i] *= yvec[i];
+	transform(xvec, true);
+	for (size_t i = 0; i < n; i++)  // Scaling (because this FFT implementation omits it)
+		xvec[i] /= static_cast<double>(n);
+	return xvec;
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
diff --git a/ref/FftComplex.hpp b/ref/FftComplex.hpp
new file mode 100644
index 0000000..24eeabb
--- /dev/null
+++ b/ref/FftComplex.hpp
@@ -0,0 +1,61 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <complex>
+#include <vector>
+using Complex = std::complex<double>;
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. The inverse transform does not perform scaling, so it is not a true inverse.
+	 */
+	void transform(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<std::complex<double> > &vec, bool inverse);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::vector<std::complex<double> > convolve(
+		std::vector<std::complex<double> > xvec,
+		std::vector<std::complex<double> > yvec);
+	
+}
diff --git a/ref/FftComplexTest.cpp b/ref/FftComplexTest.cpp
new file mode 100644
index 0000000..dd960d1
--- /dev/null
+++ b/ref/FftComplexTest.cpp
@@ -0,0 +1,177 @@
+/* 
+ * FFT and convolution test (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+#include "FftComplex.hpp"
+
+using std::complex;
+using std::cout;
+using std::endl;
+using std::vector;
+
+
+// Private function prototypes
+static void testFft(int n);
+static void testConvolution(int n);
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse);
+static vector<complex<double> > naiveConvolve(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec);
+static vector<complex<double> > randomComplexes(int n);
+
+// Mutable global variable
+static double maxLogError = -INFINITY;
+
+// Random number generation
+std::default_random_engine randGen((std::random_device())());
+
+
+/*---- Main and test functions ----*/
+
+int main() {
+	// Test power-of-2 size FFTs
+	for (int i = 0; i <= 12; i++)
+		testFft(1 << i);
+	
+	// Test small size FFTs
+	for (int i = 0; i < 30; i++)
+		testFft(i);
+	
+	// Test diverse size FFTs
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testFft(n);
+			prev = n;
+		}
+	}
+	
+	// Test power-of-2 size convolutions
+	for (int i = 0; i <= 12; i++)
+		testConvolution(1 << i);
+	
+	// Test diverse size convolutions
+	for (int i = 0, prev = 0; i <= 100; i++) {
+		int n = static_cast<int>(std::lround(std::pow(1500.0, i / 100.0)));
+		if (n > prev) {
+			testConvolution(n);
+			prev = n;
+		}
+	}
+	
+	cout << endl;
+	cout << "Max log err = " << std::setprecision(3) << maxLogError << endl;
+	cout << "Test " << (maxLogError < -10 ? "passed" : "failed") << endl;
+	return EXIT_SUCCESS;
+}
+
+
+static void testFft(int n) {
+	const vector<complex<double> > input = randomComplexes(n);
+	const vector<complex<double> > expect = naiveDft(input, false);
+	vector<complex<double> > actual = input;
+	Fft::transform(actual, false);
+	double err = log10RmsErr(expect, actual);
+	
+	for (auto it = actual.begin(); it != actual.end(); ++it)
+		*it /= n;
+	Fft::transform(actual, true);
+	err = std::max(log10RmsErr(input, actual), err);
+	cout << "fftsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << err << endl;
+}
+
+
+static void testConvolution(int n) {
+	const vector<complex<double> > input0 = randomComplexes(n);
+	const vector<complex<double> > input1 = randomComplexes(n);
+	const vector<complex<double> > expect = naiveConvolve(input0, input1);
+	const vector<complex<double> > actual = Fft::convolve(std::move(input0), std::move(input1));
+	cout << "convsize=" << std::setw(4) << std::setfill(' ') << n << "  "
+	     << "logerr=" << std::setw(5) << std::setprecision(3) << std::setiosflags(std::ios::showpoint)
+	     << log10RmsErr(expect, actual) << endl;
+}
+
+
+/*---- Naive reference computation functions ----*/
+
+static vector<complex<double> > naiveDft(const vector<complex<double> > &input, bool inverse) {
+	int n = static_cast<int>(input.size());
+	vector<complex<double> > output;
+	double coef = (inverse ? 2 : -2) * M_PI / n;
+	for (int k = 0; k < n; k++) {  // For each output element
+		complex<double> sum(0);
+		for (int t = 0; t < n; t++) {  // For each input element
+			double angle = coef * (static_cast<long long>(t) * k % n);
+			sum += input[t] * std::polar(1.0, angle);
+		}
+		output.push_back(sum);
+	}
+	return output;
+}
+
+
+static vector<complex<double> > naiveConvolve(
+		const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	vector<complex<double> > result(n);  // All zeros
+	for (int i = 0; i < n; i++) {
+		for (int j = 0; j < n; j++) {
+			int k = (i + j) % n;
+			result[k] += xvec[i] * yvec[j];
+		}
+	}
+	return result;
+}
+
+
+/*---- Utility functions ----*/
+
+static double log10RmsErr(const vector<complex<double> > &xvec, const vector<complex<double> > &yvec) {
+	int n = static_cast<int>(xvec.size());
+	double err = std::pow(10, -99 * 2);
+	for (int i = 0; i < n; i++)
+		err += std::norm(xvec.at(i) - yvec.at(i));
+	err /= n > 0 ? n : 1;
+	err = std::sqrt(err);  // Now this is a root mean square (RMS) error
+	err = std::log10(err);
+	maxLogError = std::max(err, maxLogError);
+	return err;
+}
+
+
+static vector<complex<double> > randomComplexes(int n) {
+	std::uniform_real_distribution<double> valueDist(-1.0, 1.0);
+	vector<complex<double> > result;
+	for (int i = 0; i < n; i++)
+		result.push_back(complex<double>(valueDist(randGen), valueDist(randGen)));
+	return result;
+}
diff --git a/ref/FftRealPair.cpp b/ref/FftRealPair.cpp
new file mode 100644
index 0000000..9bb414f
--- /dev/null
+++ b/ref/FftRealPair.cpp
@@ -0,0 +1,195 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include "FftRealPair.hpp"
+
+using std::size_t;
+using std::uintmax_t;
+using std::vector;
+
+
+// Private function prototypes
+static size_t reverseBits(size_t val, int width);
+
+
+void Fft::transform(vector<double> &real, vector<double> &imag) {
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	if (n == 0)
+		return;
+	else if ((n & (n - 1)) == 0)  // Is power of 2
+		transformRadix2(real, imag);
+	else  // More complicated algorithm for arbitrary sizes
+		transformBluestein(real, imag);
+}
+
+
+void Fft::inverseTransform(vector<double> &real, vector<double> &imag) {
+	transform(imag, real);
+}
+
+
+void Fft::transformRadix2(vector<double> &real, vector<double> &imag) {
+	// Length variables
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	int levels = 0;  // Compute levels = floor(log2(n))
+	for (size_t temp = n; temp > 1U; temp >>= 1)
+		levels++;
+	if (static_cast<size_t>(1U) << levels != n)
+		throw std::domain_error("Length is not a power of 2");
+	
+	// Trigonometric tables
+	vector<double> cosTable(n / 2);
+	vector<double> sinTable(n / 2);
+	for (size_t i = 0; i < n / 2; i++) {
+		cosTable[i] = std::cos(2 * M_PI * i / n);
+		sinTable[i] = std::sin(2 * M_PI * i / n);
+	}
+	
+	// Bit-reversed addressing permutation
+	for (size_t i = 0; i < n; i++) {
+		size_t j = reverseBits(i, levels);
+		if (j > i) {
+			std::swap(real[i], real[j]);
+			std::swap(imag[i], imag[j]);
+		}
+	}
+	
+	// Cooley-Tukey decimation-in-time radix-2 FFT
+	for (size_t size = 2; size <= n; size *= 2) {
+		size_t halfsize = size / 2;
+		size_t tablestep = n / size;
+		for (size_t i = 0; i < n; i += size) {
+			for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+				size_t l = j + halfsize;
+				double tpre =  real[l] * cosTable[k] + imag[l] * sinTable[k];
+				double tpim = -real[l] * sinTable[k] + imag[l] * cosTable[k];
+				real[l] = real[j] - tpre;
+				imag[l] = imag[j] - tpim;
+				real[j] += tpre;
+				imag[j] += tpim;
+			}
+		}
+		if (size == n)  // Prevent overflow in 'size *= 2'
+			break;
+	}
+}
+
+
+void Fft::transformBluestein(vector<double> &real, vector<double> &imag) {
+	// Find a power-of-2 convolution length m such that m >= n * 2 + 1
+	size_t n = real.size();
+	if (n != imag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	size_t m = 1;
+	while (m / 2 <= n) {
+		if (m > SIZE_MAX / 2)
+			throw std::length_error("Vector too large");
+		m *= 2;
+	}
+	
+	// Trigonometric tables
+	vector<double> cosTable(n), sinTable(n);
+	for (size_t i = 0; i < n; i++) {
+		uintmax_t temp = static_cast<uintmax_t>(i) * i;
+		temp %= static_cast<uintmax_t>(n) * 2;
+		double angle = M_PI * temp / n;
+		cosTable[i] = std::cos(angle);
+		sinTable[i] = std::sin(angle);
+	}
+	
+	// Temporary vectors and preprocessing
+	vector<double> areal(m), aimag(m);
+	for (size_t i = 0; i < n; i++) {
+		areal[i] =  real[i] * cosTable[i] + imag[i] * sinTable[i];
+		aimag[i] = -real[i] * sinTable[i] + imag[i] * cosTable[i];
+	}
+	vector<double> breal(m), bimag(m);
+	breal[0] = cosTable[0];
+	bimag[0] = sinTable[0];
+	for (size_t i = 1; i < n; i++) {
+		breal[i] = breal[m - i] = cosTable[i];
+		bimag[i] = bimag[m - i] = sinTable[i];
+	}
+	
+	// Convolution
+	std::pair<vector<double>, vector<double> > cvec = convolve(areal, aimag, breal, bimag);
+	vector<double> creal = std::move(cvec.first );
+	vector<double> cimag = std::move(cvec.second);
+	
+	// Postprocessing
+	for (size_t i = 0; i < n; i++) {
+		real[i] =  creal[i] * cosTable[i] + cimag[i] * sinTable[i];
+		imag[i] = -creal[i] * sinTable[i] + cimag[i] * cosTable[i];
+	}
+}
+
+
+vector<double> Fft::convolve(vector<double> xvec, vector<double> yvec) {
+	size_t n = xvec.size();
+	if (n != yvec.size())
+		throw std::invalid_argument("Mismatched lengths");
+	return convolve(std::move(xvec), vector<double>(n), std::move(yvec), vector<double>(n)).first;
+}
+
+
+std::pair<vector<double>, vector<double> > Fft::convolve(
+		vector<double> xreal, vector<double> ximag,
+		vector<double> yreal, vector<double> yimag) {
+	
+	size_t n = xreal.size();
+	if (n != ximag.size() || n != yreal.size() || n != yimag.size())
+		throw std::invalid_argument("Mismatched lengths");
+	
+	transform(xreal, ximag);
+	transform(yreal, yimag);
+	
+	for (size_t i = 0; i < n; i++) {
+		double temp = xreal[i] * yreal[i] - ximag[i] * yimag[i];
+		ximag[i] = ximag[i] * yreal[i] + xreal[i] * yimag[i];
+		xreal[i] = temp;
+	}
+	inverseTransform(xreal, ximag);
+	
+	for (size_t i = 0; i < n; i++) {  // Scaling (because this FFT implementation omits it)
+		xreal[i] /= n;
+		ximag[i] /= n;
+	}
+	return std::pair<vector<double>, vector<double> >(std::move(xreal), std::move(ximag));
+}
+
+
+static size_t reverseBits(size_t val, int width) {
+	size_t result = 0;
+	for (int i = 0; i < width; i++, val >>= 1)
+		result = (result << 1) | (val & 1U);
+	return result;
+}
+
diff --git a/ref/FftRealPair.hpp b/ref/FftRealPair.hpp
new file mode 100644
index 0000000..95db10a
--- /dev/null
+++ b/ref/FftRealPair.hpp
@@ -0,0 +1,75 @@
+/* 
+ * Free FFT and convolution (C++)
+ * 
+ * Copyright (c) 2021 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/free-small-fft-in-multiple-languages
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ *   all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ *   implied, including but not limited to the warranties of merchantability,
+ *   fitness for a particular purpose and noninfringement. In no event shall the
+ *   authors or copyright holders be liable for any claim, damages or other
+ *   liability, whether in an action of contract, tort or otherwise, arising from,
+ *   out of or in connection with the Software or the use or other dealings in the
+ *   Software.
+ */
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+
+namespace Fft {
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function.
+	 */
+	void transform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the inverse discrete Fourier transform (IDFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This is a wrapper function. This transform does not perform scaling, so the inverse is not a true inverse.
+	 */
+	void inverseTransform(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector's length must be a power of 2. Uses the Cooley-Tukey decimation-in-time radix-2 algorithm.
+	 */
+	void transformRadix2(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the discrete Fourier transform (DFT) of the given complex vector, storing the result back into the vector.
+	 * The vector can have any length. This requires the convolution function, which in turn requires the radix-2 FFT function.
+	 * Uses Bluestein's chirp z-transform algorithm.
+	 */
+	void transformBluestein(std::vector<double> &real, std::vector<double> &imag);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given real vectors. Each vector's length must be the same.
+	 */
+	std::vector<double> convolve(std::vector<double> xvec, std::vector<double> yvec);
+	
+	
+	/* 
+	 * Computes the circular convolution of the given complex vectors. Each vector's length must be the same.
+	 */
+	std::pair<std::vector<double>, std::vector<double> > convolve(
+		std::vector<double> xreal, std::vector<double> ximag,
+		std::vector<double> yreal, std::vector<double> yimag);
+	
+}
+
diff --git a/ref/README.org b/ref/README.org
new file mode 100644
index 0000000..74fd6b9
--- /dev/null
+++ b/ref/README.org
@@ -0,0 +1,5 @@
+* Reference Implementation
+
+This reference implementation is provided by Nayuki. I greatly appreciate their contribution.
+
+For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]]
diff --git a/src/complex_array.cpp b/src/complex_array.cpp
new file mode 100644
index 0000000..0bdb5ac
--- /dev/null
+++ b/src/complex_array.cpp
@@ -0,0 +1,181 @@
+#ifndef COMPLEX_ARRAY_HPP
+#define COMPLEX_ARRAY_HPP
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+using Complex = std::complex<double>;
+
+class ComplexProxy {
+    double *real_ptr;
+    double *imag_ptr;
+
+   public:
+    double &real() { return *real_ptr; }
+    double &imag() { return *imag_ptr; }
+
+    ComplexProxy(double *real_ptr, double *imag_ptr)
+        : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
+
+    ComplexProxy(ComplexProxy &&rhs) noexcept
+        : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
+        rhs.real_ptr = nullptr;
+        rhs.imag_ptr = nullptr;
+    }
+
+    operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
+
+    ComplexProxy &operator=(const Complex &c) {
+        *real_ptr = c.real();
+        *imag_ptr = c.imag();
+        return *this;
+    }
+};
+
+class ComplexArray {
+   private:
+    double *real;
+    double *imag;
+    size_t array_size;
+
+   public:
+    ComplexArray(size_t n) : array_size(n) {
+        if (n == 0) {
+            real = nullptr;
+            imag = nullptr;
+        } else {
+            real = new double[n];
+            std::fill(real, real + n, 0.0);
+            imag = new double[n];
+            std::fill(imag, imag + n, 0.0);
+        }
+    }
+
+    ~ComplexArray() {
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+    }
+
+    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+        real = new double[array_size];
+        imag = new double[array_size];
+        std::copy(other.real, other.real + array_size, real);
+        std::copy(other.imag, other.imag + array_size, imag);
+    }
+
+    ComplexArray(ComplexArray &&rhs) noexcept
+        : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
+        rhs.real = nullptr;
+        rhs.imag = nullptr;
+        rhs.array_size = 0;
+    }
+
+    ComplexArray &operator=(const ComplexArray &other) {
+        if (this == &other) {
+            return *this;
+        }
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+
+        array_size = other.array_size;
+        real = new double[array_size];
+        imag = new double[array_size];
+
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] = other.real[i];
+            imag[i] = other.imag[i];
+        }
+
+        return *this;
+    }
+
+    void multiply(const ComplexArray &other) {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        for (size_t i = 0; i < array_size; i++) {
+            double temp_real =
+                real[i] * other.real[i] - imag[i] * other.imag[i];
+            double temp_imag =
+                real[i] * other.imag[i] + imag[i] * other.real[i];
+            real[i] = temp_real;
+            imag[i] = temp_imag;
+        }
+    }
+
+    // void partial_multiply(const ComplexArray &other, size_t start, size_t length) {
+        
+    //     size_t end = start + length;
+    //     if (end > array_size || end > other.array_size) {
+    //         throw std::invalid_argument("Array sizes do not match");
+    //     }
+    //     for (size_t i = 0; i < length; i++) {
+    //         double temp_real =
+    //             real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
+    //         double temp_imag =
+    //             real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
+    //         real[start + i] = temp_real;
+    //         imag[start + i] = temp_imag;
+    //     }
+    // }
+
+    void scale(double scalar) {
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] *= scalar;
+            imag[i] *= scalar;
+        }
+    }
+
+    ComplexArray operator*(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+        ComplexArray result = *this;  // Copy constructor
+        result.multiply(other);
+        return result;
+    }
+
+    ComplexArray operator+(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        ComplexArray result(array_size);
+        for (size_t i = 0; i < array_size; i++) {
+            result.real[i] = real[i] + other.real[i];
+            result.imag[i] = imag[i] + other.imag[i];
+        }
+        return result;
+    }
+
+    Complex operator[](size_t index) const {
+        return Complex(real[index], imag[index]);
+    }
+
+    ComplexProxy operator[](size_t index) {
+        return ComplexProxy(real + index, imag + index);
+    }
+
+    
+
+    size_t size() const { return array_size; }
+
+    double *get_real_ptr() { return real; }
+
+    const double *get_real_ptr() const { return real; }
+
+    double *get_imag_ptr() { return imag; }
+
+    const double *get_imag_ptr() const { return imag; }
+};
+
+#endif
diff --git a/src/complex_array.hpp b/src/complex_array.hpp
new file mode 100644
index 0000000..911d8fa
--- /dev/null
+++ b/src/complex_array.hpp
@@ -0,0 +1,181 @@
+#ifndef COMPLEX_ARRAY_HPP
+#define COMPLEX_ARRAY_HPP
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+using Complex = std::complex<double>;
+
+class ComplexProxy {
+    double *real_ptr;
+    double *imag_ptr;
+
+   public:
+    double &real() { return *real_ptr; }
+    double &imag() { return *imag_ptr; }
+
+    ComplexProxy(double *real_ptr, double *imag_ptr)
+        : real_ptr(real_ptr), imag_ptr(imag_ptr) {}
+
+    ComplexProxy(ComplexProxy &&rhs) noexcept
+        : real_ptr(rhs.real_ptr), imag_ptr(rhs.imag_ptr) {
+        rhs.real_ptr = nullptr;
+        rhs.imag_ptr = nullptr;
+    }
+
+    operator Complex() const { return Complex(*real_ptr, *imag_ptr); }
+
+    ComplexProxy &operator=(const Complex &c) {
+        *real_ptr = c.real();
+        *imag_ptr = c.imag();
+        return *this;
+    }
+};
+
+class ComplexArray {
+   private:
+    double *real;
+    double *imag;
+    size_t array_size;
+
+   public:
+    ComplexArray(size_t n) : array_size(n) {
+        if (n == 0) {
+            real = nullptr;
+            imag = nullptr;
+        } else {
+            real = new double[n];
+            std::fill(real, real + n, 0.0);
+            imag = new double[n];
+            std::fill(imag, imag + n, 0.0);
+        }
+    }
+
+    ~ComplexArray() {
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+    }
+
+    ComplexArray(const ComplexArray &other) : array_size(other.array_size) {
+        real = new double[array_size];
+        imag = new double[array_size];
+        std::copy(other.real, other.real + array_size, real);
+        std::copy(other.imag, other.imag + array_size, imag);
+    }
+
+    ComplexArray(ComplexArray &&rhs) noexcept
+        : real(rhs.real), imag(rhs.imag), array_size(rhs.array_size) {
+        rhs.real = nullptr;
+        rhs.imag = nullptr;
+        rhs.array_size = 0;
+    }
+
+    ComplexArray &operator=(const ComplexArray &other) {
+        if (this == &other) {
+            return *this;
+        }
+        if (real != nullptr) {
+            delete[] real;
+        }
+        if (imag != nullptr) {
+            delete[] imag;
+        }
+
+        array_size = other.array_size;
+        real = new double[array_size];
+        imag = new double[array_size];
+
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] = other.real[i];
+            imag[i] = other.imag[i];
+        }
+
+        return *this;
+    }
+
+    void multiply(const ComplexArray &other, bool simd = false) {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        for (size_t i = 0; i < array_size; i++) {
+            double temp_real =
+                real[i] * other.real[i] - imag[i] * other.imag[i];
+            double temp_imag =
+                real[i] * other.imag[i] + imag[i] * other.real[i];
+            real[i] = temp_real;
+            imag[i] = temp_imag;
+        }
+    }
+
+    // void partial_multiply(const ComplexArray &other, size_t start, size_t length) {
+        
+    //     size_t end = start + length;
+    //     if (end > array_size || end > other.array_size) {
+    //         throw std::invalid_argument("Array sizes do not match");
+    //     }
+    //     for (size_t i = 0; i < length; i++) {
+    //         double temp_real =
+    //             real[start + i] * other.real[i] - imag[start + i] * other.imag[i];
+    //         double temp_imag =
+    //             real[start + i] * other.imag[i] + imag[start + i] * other.real[i];
+    //         real[start + i] = temp_real;
+    //         imag[start + i] = temp_imag;
+    //     }
+    // }
+
+    void scale(double scalar, bool simd = false) {
+        for (size_t i = 0; i < array_size; i++) {
+            real[i] *= scalar;
+            imag[i] *= scalar;
+        }
+    }
+
+    ComplexArray operator*(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+        ComplexArray result = *this;  // Copy constructor
+        result.multiply(other);
+        return result;
+    }
+
+    ComplexArray operator+(const ComplexArray &other) const {
+        if (array_size != other.array_size) {
+            throw std::invalid_argument("Array sizes do not match");
+        }
+
+        ComplexArray result(array_size);
+        for (size_t i = 0; i < array_size; i++) {
+            result.real[i] = real[i] + other.real[i];
+            result.imag[i] = imag[i] + other.imag[i];
+        }
+        return result;
+    }
+
+    Complex operator[](size_t index) const {
+        return Complex(real[index], imag[index]);
+    }
+
+    ComplexProxy operator[](size_t index) {
+        return ComplexProxy(real + index, imag + index);
+    }
+
+    
+
+    size_t size() const { return array_size; }
+
+    double *get_real_ptr() { return real; }
+
+    const double *get_real_ptr() const { return real; }
+
+    double *get_imag_ptr() { return imag; }
+
+    const double *get_imag_ptr() const { return imag; }
+};
+
+#endif
diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp
new file mode 100644
index 0000000..07e19fe
--- /dev/null
+++ b/src/dct_naive.hpp
@@ -0,0 +1,70 @@
+#pragma once
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <cmath>
+
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+
+constexpr double SQRT2_1 = 0.7071067811865475;
+constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
+
+static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) {
+  double sum = 0.0f;
+  for (size_t j = 0; j < buf_size; j++) {
+    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
+  }
+  return sum;
+}
+
+static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) {
+  double sum = 0.0;
+  for (size_t j = 1; j < buf_size; j++) {
+    sum += buf[j] * cos(M_PI * j * (2 * i + 1.0) / (2.0 * buf_size));
+  }
+  return sum;
+}
+
+void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
+  for (size_t i = 0; i < buf_size; i++) {
+    out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd);
+  }
+}
+
+void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) {
+  for (size_t i = 0; i < buf_size; i++) {
+    out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size);
+  }
+}
+
+py::array dct_naive(
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  dct_naive_inner(ptr, buf_size, out_buf, simd);
+
+  return out_arr;
+}
+
+py::array idct_naive(
+  py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+  py::buffer_info buf_info = in_arr.request();
+  double *ptr = static_cast<double *>(buf_info.ptr);
+  size_t buf_size = buf_info.size;
+
+  py::array_t<double> out_arr(buf_info.shape);
+
+  double *out_buf = static_cast<double *>(out_arr.request(true).ptr);
+  idct_naive_inner(ptr, buf_size, out_buf, simd);
+
+  return out_arr;
+}
+
diff --git a/src/fdct.cpp b/src/fdct.cpp
deleted file mode 100644
index 5163348..0000000
--- a/src/fdct.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-#include <cstddef>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <iostream>
-
-namespace py = pybind11;
-#define _USE_MATH_DEFINES
-#include <cmath>
-#include <iostream>
-
-using data_t = double;
-
-constexpr data_t SQRT2_1 = 0.7071067811865475;
-constexpr data_t ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475;
-
-static inline data_t e(int n) {
-  // if (n == 0)
-  //   return SQRT2_1;
-  return 1.0f;
-}
-
-
-static inline data_t dct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0f;
-  for (size_t j = 0; j < buf_size; j++) {
-    // sum += buf[j] * cos((2 * j + 1) * i * M_PI / (2 * buf_size));
-    sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5));
-  }
-  return sum;
-}
-
-static inline data_t idct_naive_single(data_t *buf, size_t buf_size, size_t i) {
-  data_t sum = 0.0;
-  for (size_t j = 0; j < buf_size; j++) {
-    sum += buf[j] * cos(M_PI * i * (2 * j + 1.0) / (2 * buf_size));
-  }
-  return sum;
-}
-
-void dct_naive(data_t *buf, size_t buf_size, data_t *out) {
-  
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = 2.0 * e(i) * dct_naive_single(buf, buf_size, i);
-  }
-}
-
-void idct_naive(data_t *buf, size_t buf_size, data_t *out) {
-  
-  for (int i = 0; i < buf_size; i++) {
-    out[i] = (buf[0] + 2 * idct_naive_single(buf, buf_size, i)) / buf_size;
-  }
-}
-
-double err(data_t* buf1, data_t* buf2, size_t buf_size) {
-  double sum = 0.0;
-  for(int i = 0; i < buf_size; i++)
-    sum += abs(buf1[i] - buf2[i]);
-  return sum;
-}
-
-py::array fct(py::array_t<double, py::array::c_style | py::array::forcecast>  in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double*>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  std::cout << "fct: Get array with " << buf_info.size << std::endl;
-  std::cout << "buf: " << ptr[0] << " " << ptr[1] << std::endl;
-  
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double*>(out_arr.request(true).ptr);
-  dct_naive(ptr, buf_size, out_buf);
-  
-  return out_arr;
-}
-
-py::array ifct(py::array_t<double, py::array::c_style | py::array::forcecast>  in_arr) {
-  py::buffer_info buf_info = in_arr.request();
-  double *ptr = static_cast<double*>(buf_info.ptr);
-  size_t buf_size = buf_info.size;
-
-  std::cout << "ifct: Get array with " << buf_info.size << std::endl;
-  std::cout << "buf: " << ptr[0] << " " << ptr[1] << std::endl;
-  
-  py::array_t<double> out_arr(buf_info.shape);
-
-  double *out_buf = static_cast<double*>(out_arr.request(true).ptr);
-  idct_naive(ptr, buf_size, out_buf);
-  
-  return out_arr;
-
-}
-
-PYBIND11_MODULE(fdct, m) {
-  m.def("fct", &fct, "Compute fast discrete cosine transformation.");
-  m.def("ifct", &ifct, "Compute inverse fast discrete cosine transformation.");
-}
diff --git a/src/fdct.hpp b/src/fdct.hpp
new file mode 100644
index 0000000..476d7fc
--- /dev/null
+++ b/src/fdct.hpp
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <fftw3.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <iostream>
+
+// #include "FftComplex.hpp"
+#include "complex_array.hpp"
+#include "fdct_inner.hpp"
+#include "fft.hpp"
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+
+// Wrapper function to accept and return NumPy arrays
+py::array fdct(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    // Request a buffer from the input array
+    py::buffer_info buf = in_arr.request();
+    // Check if the input array is 1-dimensional or 2-dimensional
+
+    if (buf.ndim > 2) {
+        throw std::runtime_error(
+            "Input array must be 1-dimensional or 2-dimensional.");
+    }
+
+    // Convert input NumPy array to std::vector
+    size_t N = buf.shape[0];
+    ComplexArray input_data(N);
+    std::copy(static_cast<double *>(buf.ptr),
+              static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
+    std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0);
+    // Perform FCT
+    double *result = new double[N];
+    fct_inner(input_data, result, simd);
+
+    // Convert result back to NumPy array
+    return py::array_t<double>(N, result);
+}
+
+std::vector<double> idct_inner(const std::vector<double> &c) {
+    size_t N = c.size();
+    std::vector<double> x(N);
+    // auto u = reorderOutput(c);
+    ComplexArray u(N);
+    // Allocate FFTW input/output arrays
+    fftw_complex *fft_in =
+        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+    fftw_complex *fft_out =
+        (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N);
+
+    // Initialize FFTW plan
+    fftw_plan plan =
+        fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+    // Fill FFTW input array
+    for (size_t n = 0; n < N; ++n) {
+        fft_in[n][0] = c[n];  // Real part
+        fft_in[n][1] = 0.0;   // Imaginary part
+    }
+
+    // Execute IFFT
+    fftw_execute(plan);
+
+    // Compute the IDCT using the IFFT output
+    for (size_t k = 0; k < N; ++k) {
+        double real = fft_out[k][0];
+        double imag = fft_out[k][1];
+        x[k] = (real * cos(M_PI * k / (2.0 * N)) +
+                imag * sin(M_PI * k / (2.0 * N))) /
+               N;
+    }
+
+    // Free FFTW resources
+    fftw_destroy_plan(plan);
+    fftw_free(fft_in);
+    fftw_free(fft_out);
+
+    return x;
+}
+
+// Wrapper function to accept and return NumPy arrays
+py::array ifdct(
+    py::array_t<double, py::array::c_style | py::array::forcecast> in_arr, bool simd) {
+    // Request a buffer from the input array
+    py::buffer_info buf = in_arr.request();
+    if (buf.ndim != 1) {
+        throw std::runtime_error("Input array must be 1-dimensional.");
+    }
+
+    // Convert input NumPy array to std::vector
+    size_t N = buf.shape[0];
+    // std::vector<double> input_data(static_cast<double *>(buf.ptr),
+    //                               static_cast<double *>(buf.ptr) + N);
+    ComplexArray input_data(N);
+    std::copy(static_cast<double *>(buf.ptr),
+              static_cast<double *>(buf.ptr) + N, input_data.get_real_ptr());
+
+    // Perform IDCT
+    double *result = new double[N];
+    ifdct_inner(input_data, result, simd);
+    // Convert result back to NumPy array
+    return py::array_t<double>(N, result);
+}
diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp
new file mode 100644
index 0000000..4b717dd
--- /dev/null
+++ b/src/fdct_inner.cpp
@@ -0,0 +1,106 @@
+
+#include <fftw3.h>
+
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <iostream>
+#include <vector>
+
+#include "complex_array.hpp"
+#include "fdct_inner.hpp"
+#include "fft.hpp"
+
+// Function to compute the Fast Cosine Transform
+
+ComplexArray reorderInput(const ComplexArray &x) {
+    size_t N = x.size();
+    ComplexArray u(N);
+    const double *x_real = x.get_real_ptr();
+    const double *x_imag = x.get_imag_ptr();
+    double *u_real = u.get_real_ptr();
+    double *u_imag = u.get_imag_ptr();
+
+    for (size_t n = 0; n < N / 2; ++n) {
+        u_real[n] = x_real[2 * n];
+        u_imag[n] = x_imag[2 * n];
+        u_real[N - n - 1] = x_real[2 * n + 1];
+        u_imag[N - n - 1] = x_imag[2 * n + 1];
+    }
+    if (N % 2 != 0) {
+        u_real[N / 2] = x_real[N - 1];
+        u_imag[N / 2] = x_imag[N - 1];
+    }
+    return u;
+}
+
+ComplexArray reorderOutput(const ComplexArray &v) {
+    size_t N = v.size();
+    ComplexArray x(N);
+
+    // Access the real and imaginary parts directly
+    const double *v_real = v.get_real_ptr();
+    const double *v_imag = v.get_imag_ptr();
+    double *x_real = x.get_real_ptr();
+    double *x_imag = x.get_imag_ptr();
+
+    // Reordering according to the formula
+    for (size_t n = 0; n < N / 2; ++n) {
+        x_real[2 * n] = v_real[n];
+        x_imag[2 * n] = v_imag[n];
+        x_real[2 * n + 1] = v_real[N - n - 1];
+        x_imag[2 * n + 1] = v_imag[N - n - 1];
+    }
+    if (N % 2 != 0) {
+        x_real[N - 1] = v_real[N / 2];
+        x_imag[N - 1] = v_imag[N / 2];
+    }
+
+    return x;
+}
+
+void idctPreprocess(ComplexArray &x) {
+    size_t N = x.size();
+    auto real = x.get_real_ptr();
+    auto imag = x.get_imag_ptr();
+
+    real[0] /= 2;
+    for (size_t n = 0; n < N; ++n) {
+        double tmp = n * M_PI / (2 * N);
+        double tmp_real = real[n];
+        real[n] = tmp_real * cos(tmp);
+        imag[n] = -tmp_real * sin(tmp);
+    }
+}
+
+
+void fct_inner(const ComplexArray &x, double *result, bool simd) {
+    size_t N = x.size();
+    auto fft_in = reorderInput(x);
+
+    // Perform FFT
+    fft::transform(fft_in, false);
+
+    // Compute the DCT using the FFT output
+    double *x_real = fft_in.get_real_ptr();
+    double *x_imag = fft_in.get_imag_ptr();
+    for (size_t k = 0; k < N; ++k) {
+        result[k] =
+            (x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) *
+            2;
+    }
+}
+
+void ifdct_inner(const ComplexArray &c, double *result, bool simd) {
+    size_t N = c.size();
+    ComplexArray fft_in = c;
+    idctPreprocess(fft_in);
+
+    // Perform FFT
+    fft::transform(fft_in, false);
+
+    auto x_ = reorderOutput(fft_in);
+    for (size_t n = 0; n < N; ++n) {
+        result[n] = x_.get_real_ptr()[n] / N;
+    }
+}
diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp
new file mode 100644
index 0000000..38dd6a3
--- /dev/null
+++ b/src/fdct_inner.hpp
@@ -0,0 +1,17 @@
+
+#pragma once
+#include <iostream>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <vector>
+
+#include "complex_array.hpp"
+
+// Function to compute the Fast Cosine Transform
+
+ComplexArray reorderInput(const ComplexArray &x);
+
+void fct_inner(const ComplexArray &x, double *result, bool simd);
+
+void ifdct_inner(const ComplexArray &c, double *result, bool simd);
diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp
new file mode 100644
index 0000000..38c5d1e
--- /dev/null
+++ b/src/fdct_module.cpp
@@ -0,0 +1,16 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+#include "fdct.hpp"
+#include "dct_naive.hpp"
+
+namespace py = pybind11;
+#define _USE_MATH_DEFINES
+#include <cmath>
+
+PYBIND11_MODULE(fdct, m) {
+  m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation.");
+  m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation.");
+  m.def("fdct", &fdct, "Compute fast discrete cosine transformation.");
+  m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation.");
+}
diff --git a/src/fft.cpp b/src/fft.cpp
new file mode 100644
index 0000000..0e8bfb0
--- /dev/null
+++ b/src/fft.cpp
@@ -0,0 +1,156 @@
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+#include <stdint.h>
+
+#include "complex_array.hpp"
+#include "fft.hpp"
+
+static size_t reverseBits(size_t val, int width);
+
+static ComplexArray mkExpTable(size_t n, bool inverse) {
+  ComplexArray expTable(n);
+  double *real = expTable.get_real_ptr();
+  double *imag = expTable.get_imag_ptr();
+  for (size_t i = 0; i < n; i++) {
+    double angle = (inverse ? 2 : -2) * M_PI * i / n;
+    real[i] = std::cos(angle);
+    imag[i] = std::sin(angle);
+  }
+  return expTable;
+}
+
+void fft::transformRadix2(ComplexArray &x, bool inverse, bool simd) {
+  int levels = 0;
+  size_t n = x.size();
+  double *real = x.get_real_ptr();
+  double *imag = x.get_imag_ptr();
+  
+  for (size_t temp = n; temp > 1U; temp >>= 1)
+    levels++;
+  if (static_cast<size_t>(1U) << levels != n)
+    throw std::domain_error("Length is not a power of 2");
+
+  ComplexArray expTable = mkExpTable(n, inverse);
+
+
+  for (size_t i = 0; i < n; i++) {
+    size_t j = reverseBits(i, levels);
+    if (j > i) {
+      std::swap(real[i], real[j]);
+      std::swap(imag[i], imag[j]);
+    }
+  }
+
+  double *real_expTable = expTable.get_real_ptr();
+  double *imag_expTable = expTable.get_imag_ptr();
+
+  for (size_t size = 2; size <= n; size *= 2) {
+    size_t halfsize = size / 2;
+    size_t tablestep = n / size;
+    for (size_t i = 0; i < n; i += size) {
+      for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) {
+        double temp_real = real[j + halfsize] * real_expTable[k] -
+                           imag[j + halfsize] * imag_expTable[k];
+        double temp_imag = real[j + halfsize] * imag_expTable[k] +
+                           imag[j + halfsize] * real_expTable[k];
+        real[j + halfsize] = real[j] - temp_real;
+        imag[j + halfsize] = imag[j] - temp_imag;
+        real[j] += temp_real;
+        imag[j] += temp_imag;
+      }
+    }
+    if (size == n)
+      break;
+  }
+}
+
+void fft::transformBluestein(ComplexArray &x, bool inverse, bool simd) {
+  size_t n = x.size();
+  double *real = x.get_real_ptr();
+  double *imag = x.get_imag_ptr();
+  size_t m = 1;
+  while (m / 2 <= n) {
+    if (m > SIZE_MAX / 2)
+      throw std::length_error("Vector too large");
+    m *= 2;
+  }
+
+  // Trigonometric table
+  ComplexArray expTable(n);
+  double *real_expTable = expTable.get_real_ptr();
+  double *imag_expTable = expTable.get_imag_ptr();
+  for (size_t i = 0; i < n; i++) {
+    uintmax_t temp = static_cast<uintmax_t>(i) * i;
+    temp %= static_cast<uintmax_t>(n) * 2;
+    double angle = (inverse ? M_PI : -M_PI) * temp / n;
+    expTable[i] = std::polar(1.0, angle);
+  }
+  ComplexArray a(m), b(m);
+  double *real_avec = a.get_real_ptr();
+  double *imag_avec = a.get_imag_ptr();
+  
+  
+  // TODO: Use SIMD
+  for (size_t i = 0; i < n; i++) {
+    real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i];
+    imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i];
+    // a[i] = Complex(x[i]) * Complex(expTable[i]);
+  }
+  
+  double *real_bvec = b.get_real_ptr();
+  double *imag_bvec = b.get_imag_ptr();
+  real_bvec[0] = real_expTable[0];
+  imag_bvec[0] = imag_expTable[0];
+  
+  for (size_t i = 1; i < n; ++i) {
+    real_bvec[i] = real_bvec[m - i] = real_expTable[i];
+    imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i];
+  }
+  
+  convolove(a, b, simd);
+  // x = c * expTable;
+  for (size_t i = 0; i < n; i++) {
+    real[i] = a.get_real_ptr()[i] * real_expTable[i] - a.get_imag_ptr()[i] * imag_expTable[i];
+    imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + a.get_imag_ptr()[i] * real_expTable[i];
+  }
+}
+
+
+// compute the convolution of two complex arrays and set the result to x
+void fft::convolove(ComplexArray &_x, const ComplexArray &_y, bool simd) {
+  size_t n = _x.size();
+  if (n != _y.size())
+    throw std::domain_error("Mismatched lengths");
+  
+  ComplexArray& x = _x;
+  ComplexArray y = _y;
+
+  transform(x, false, simd);
+  transform(y, false, simd);
+
+  x.multiply(y, simd);
+  transform(x, true, simd);
+  x.scale(1.0 / n, simd);
+}
+
+void fft::transform(ComplexArray &x, bool inverse, bool simd) {
+  size_t n = x.size();
+  if (n == 0)
+    return;
+  else if ((n & (n - 1)) == 0) // Is power of 2
+    transformRadix2(x, inverse, simd);
+  else // More complicated algorithm for arbitrary sizes
+    transformBluestein(x, inverse, simd);
+}
+
+static size_t reverseBits(size_t val, int width) {
+  size_t result = 0;
+  for (int i = 0; i < width; i++, val >>= 1)
+    result = (result << 1) | (val & 1U);
+  return result;
+}
diff --git a/src/fft.hpp b/src/fft.hpp
new file mode 100644
index 0000000..868eee7
--- /dev/null
+++ b/src/fft.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "complex_array.hpp"
+
+namespace fft {
+void transformRadix2(ComplexArray &x, bool inverse, bool simd = false);
+
+void transformBluestein(ComplexArray &x, bool inverse, bool simd = false);
+
+void transform(ComplexArray &x, bool inverse, bool simd = false);
+
+void convolove(ComplexArray &x, const ComplexArray &y, bool simd = false);
+} // namespace fft
diff --git a/test_dct.py b/test_dct.py
index a8707cc..092c6f7 100644
--- a/test_dct.py
+++ b/test_dct.py
@@ -1,25 +1,28 @@
+import pytest
 import numpy as np
 import fdct
-import scipy
+import scipy.fft as fft
+import time
 
-def test_dct():
-    inp = np.random.rand(1024)
-    gt = scipy.fft.dct(inp)
-    ans = fdct.fct(inp)
-    s = sum((gt - ans) ** 2) / 1024
-    assert s < 0.1
+# Parameters
+test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097, 8192, 8193]
+tolerance = 1e-6  # Tolerance for individual element comparison
+simd = [False, True]  # SIMD flag
 
-def test_idct():
-    inp = np.random.rand(1024)
-    gt = scipy.fft.idct(inp)
-    ans = fdct.ifct(inp)
-    print(gt)
-    print(ans)
-    s = sum((gt - ans) ** 2) / 1024
-    assert s < 0.1
-    
+# List of test functions and their corresponding references
+test_cases = [
+    ("dct_naive", fdct.dct_naive, fft.dct),
+    ("idct_naive", fdct.idct_naive, fft.idct),
+    ("fdct", fdct.fdct, fft.dct),
+    ("ifdct", fdct.ifdct, fft.idct),
+]
+
+@pytest.mark.parametrize("size", test_size_seq)
+@pytest.mark.parametrize("name, func, ref_func", test_cases)
+@pytest.mark.parametrize("simd", simd)
+def test_transform(name, func, ref_func, size, simd):
+    inp = np.random.rand(size)
+    gt = ref_func(inp)
+    ans = func(inp, simd)
+    np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0)
 
-    
-if __name__ == "__main__":
-    test_dct()
-    test_idct()