diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4afb28a..de69a9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,15 @@
 cmake_minimum_required(VERSION 3.5)
 project(fdct)

+add_compile_options(-Wall)
+
+# AVX2
+add_compile_options(-mavx2)
+
+# OpenMP
+find_package(OpenMP REQUIRED)
+add_compile_options(${OpenMP_CXX_FLAGS})
+
 # Find Python
 find_package(PythonLibs REQUIRED)
 include_directories(${PYTHON_INCLUDE_DIRS})
@@ -8,10 +17,10 @@ include_directories(${PYTHON_INCLUDE_DIRS})
 # Find Pybind11
 find_package(pybind11 REQUIRED)

-find_package(PkgConfig REQUIRED)
-pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
-include_directories(PkgConfig::FFTW)
-link_libraries(PkgConfig::FFTW)
+# find_package(PkgConfig REQUIRED)
+# pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET)
+# include_directories(PkgConfig::FFTW)
+# link_libraries(PkgConfig::FFTW)

 pybind11_add_module(fdct
     src/fdct_module.cpp
@@ -21,3 +30,4 @@ pybind11_add_module(fdct

 # Link the module with Python
 target_link_libraries(fdct PRIVATE ${PYTHON_LIBRARIES})
+target_link_libraries(fdct PRIVATE OpenMP::OpenMP_CXX)
diff --git a/README.org b/README.org
index e1782c1..6ed7575 100644
--- a/README.org
+++ b/README.org
@@ -43,7 +43,7 @@ modern multicore processors.
 :PROPERTIES:
 :CUSTOM_ID: prospective-users
 :END:
-FDCP will be valuable in fieds where DCT is used extensively:
+FDCT will be valuable in fields where DCT is used extensively:

 - Image processing and compression
 - Audio processing and compression
@@ -136,3 +136,7 @@ multithread=True, simd=True)
    [[https://coehuman.uodiyala.edu.iq/uploads/Coehuman%20library%20pdf/%D9%83%D8%AA%D8%A8%20%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D9%8A%D8%A7%D8%AA%20Mathematics%20books/Wavelets/25%20(2).pdf]]
 3. C. W. Kok, Fast algorithm for computing discrete cosine transform
    [[https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=558495]]
+4. Nayuki, Fast discrete cosine transform algorithms
+   [[https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms]]
+5. Nayuki, Free small FFT in multiple languages
+   [[https://www.nayuki.io/page/free-small-fft-in-multiple-languages]]
diff --git a/ref/README.org b/ref/README.org
index 74fd6b9..2befb38 100644
--- a/ref/README.org
+++ b/ref/README.org
@@ -2,4 +2,6 @@
 This reference implementation is provided by Nayuki.
 I greatly appreciate their contribution.

-For more details, please visit Nayuki's [[Website][https://www.nayuki.io/]] +For more details, please visit Nayuki's website +- https://www.nayuki.io/page/free-small-fft-in-multiple-languages +- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms diff --git a/src/complex_array.hpp b/src/complex_array.hpp index 911d8fa..e199c5b 100644 --- a/src/complex_array.hpp +++ b/src/complex_array.hpp @@ -1,5 +1,6 @@ -#ifndef COMPLEX_ARRAY_HPP -#define COMPLEX_ARRAY_HPP +#pragma once +#include + #include #include #include @@ -101,37 +102,65 @@ class ComplexArray { if (array_size != other.array_size) { throw std::invalid_argument("Array sizes do not match"); } - - for (size_t i = 0; i < array_size; i++) { - double temp_real = - real[i] * other.real[i] - imag[i] * other.imag[i]; - double temp_imag = - real[i] * other.imag[i] + imag[i] * other.real[i]; - real[i] = temp_real; - imag[i] = temp_imag; + if (simd) { + for (size_t i = 0; i <= array_size - 4; i += 4) { + __m256d real_vec = _mm256_loadu_pd(real + i); + __m256d imag_vec = _mm256_loadu_pd(imag + i); + __m256d other_real_vec = _mm256_loadu_pd(other.real + i); + __m256d other_imag_vec = _mm256_loadu_pd(other.imag + i); + + __m256d temp_real = + _mm256_sub_pd(_mm256_mul_pd(real_vec, other_real_vec), + _mm256_mul_pd(imag_vec, other_imag_vec)); + __m256d temp_imag = + _mm256_add_pd(_mm256_mul_pd(real_vec, other_imag_vec), + _mm256_mul_pd(imag_vec, other_real_vec)); + + _mm256_storeu_pd(real + i, temp_real); + _mm256_storeu_pd(imag + i, temp_imag); + } + for (size_t i = array_size - array_size % 4; i < array_size; i++) { + double temp_real = + real[i] * other.real[i] - imag[i] * other.imag[i]; + double temp_imag = + real[i] * other.imag[i] + imag[i] * other.real[i]; + real[i] = temp_real; + imag[i] = temp_imag; + } + } else { + for (size_t i = 0; i < array_size; i++) { + double temp_real = + real[i] * other.real[i] - imag[i] * other.imag[i]; + double temp_imag = + real[i] * other.imag[i] + imag[i] * other.real[i]; + real[i] = temp_real; + imag[i] = temp_imag; + } } } - // void partial_multiply(const ComplexArray &other, size_t start, size_t length) { - - // size_t end = start + length; - // if (end > array_size || end > other.array_size) { - // throw std::invalid_argument("Array sizes do not match"); - // } - // for (size_t i = 0; i < length; i++) { - // double temp_real = - // real[start + i] * other.real[i] - imag[start + i] * other.imag[i]; - // double temp_imag = - // real[start + i] * other.imag[i] + imag[start + i] * other.real[i]; - // real[start + i] = temp_real; - // imag[start + i] = temp_imag; - // } - // } - void scale(double scalar, bool simd = false) { - for (size_t i = 0; i < array_size; i++) { - real[i] *= scalar; - imag[i] *= scalar; + if (simd) { + __m256d scalar_vec = _mm256_set1_pd(scalar); + for (size_t i = 0; i <= array_size - 4; i += 4) { + __m256d real_vec = _mm256_loadu_pd(real + i); + __m256d imag_vec = _mm256_loadu_pd(imag + i); + + real_vec = _mm256_mul_pd(real_vec, scalar_vec); + imag_vec = _mm256_mul_pd(imag_vec, scalar_vec); + + _mm256_storeu_pd(real + i, real_vec); + _mm256_storeu_pd(imag + i, imag_vec); + } + for (size_t i = array_size - array_size % 4; i < array_size; i++) { + real[i] *= scalar; + imag[i] *= scalar; + } + } else { + for (size_t i = 0; i < array_size; i++) { + real[i] *= scalar; + imag[i] *= scalar; + } } } @@ -165,8 +194,6 @@ class ComplexArray { return ComplexProxy(real + index, imag + index); } - - size_t size() const { return array_size; } double *get_real_ptr() { return real; } @@ 
-177,5 +204,3 @@ class ComplexArray { const double *get_imag_ptr() const { return imag; } }; - -#endif diff --git a/src/dct_naive.hpp b/src/dct_naive.hpp index 07e19fe..bb68728 100644 --- a/src/dct_naive.hpp +++ b/src/dct_naive.hpp @@ -12,7 +12,7 @@ namespace py = pybind11; constexpr double SQRT2_1 = 0.7071067811865475; constexpr double ONE_SUB_SQRT2_1 = 1 - 0.7071067811865475; -static inline double dct_naive_single(double *buf, size_t buf_size, size_t i, bool simd) { +static inline double dct_naive_single(double *buf, size_t buf_size, size_t i) { double sum = 0.0f; for (size_t j = 0; j < buf_size; j++) { sum += buf[j] * cos(M_PI / buf_size * i * (j + 0.5)); @@ -28,20 +28,22 @@ static inline double idct_naive_single(double *buf, size_t buf_size, size_t i) { return sum; } -void dct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) { +void dct_naive_inner(double *buf, size_t buf_size, double *out) { +#pragma omp parallel for for (size_t i = 0; i < buf_size; i++) { - out[i] = 2.0 * dct_naive_single(buf, buf_size, i, simd); + out[i] = 2.0 * dct_naive_single(buf, buf_size, i); } } -void idct_naive_inner(double *buf, size_t buf_size, double *out, bool simd) { +void idct_naive_inner(double *buf, size_t buf_size, double *out) { +#pragma omp parallel for for (size_t i = 0; i < buf_size; i++) { out[i] = (buf[0] + (2 * idct_naive_single(buf, buf_size, i))) / (2.0 * buf_size); } } py::array dct_naive( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { py::buffer_info buf_info = in_arr.request(); double *ptr = static_cast(buf_info.ptr); size_t buf_size = buf_info.size; @@ -49,13 +51,13 @@ py::array dct_naive( py::array_t out_arr(buf_info.shape); double *out_buf = static_cast(out_arr.request(true).ptr); - dct_naive_inner(ptr, buf_size, out_buf, simd); + dct_naive_inner(ptr, buf_size, out_buf); return out_arr; } py::array idct_naive( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { py::buffer_info buf_info = in_arr.request(); double *ptr = static_cast(buf_info.ptr); size_t buf_size = buf_info.size; @@ -63,7 +65,7 @@ py::array idct_naive( py::array_t out_arr(buf_info.shape); double *out_buf = static_cast(out_arr.request(true).ptr); - idct_naive_inner(ptr, buf_size, out_buf, simd); + idct_naive_inner(ptr, buf_size, out_buf); return out_arr; } diff --git a/src/fdct.hpp b/src/fdct.hpp index 476d7fc..64b8fdf 100644 --- a/src/fdct.hpp +++ b/src/fdct.hpp @@ -21,7 +21,7 @@ namespace py = pybind11; // Wrapper function to accept and return NumPy arrays py::array fdct( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { // Request a buffer from the input array py::buffer_info buf = in_arr.request(); // Check if the input array is 1-dimensional or 2-dimensional @@ -38,57 +38,16 @@ py::array fdct( static_cast(buf.ptr) + N, input_data.get_real_ptr()); std::fill(input_data.get_imag_ptr(), input_data.get_imag_ptr() + N, 0.0); // Perform FCT - double *result = new double[N]; - fct_inner(input_data, result, simd); - + py::array_t result(N); + double *result_ptr = static_cast(result.request().ptr); + fct_inner(input_data, result_ptr); // Convert result back to NumPy array - return py::array_t(N, result); -} - -std::vector idct_inner(const std::vector &c) { - size_t N = c.size(); - std::vector x(N); - // auto u = reorderOutput(c); - ComplexArray u(N); - // Allocate FFTW input/output arrays - fftw_complex *fft_in = - (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); - fftw_complex *fft_out = - (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); - - // Initialize FFTW plan - 
fftw_plan plan = - fftw_plan_dft_1d(N, fft_in, fft_out, FFTW_BACKWARD, FFTW_ESTIMATE); - - // Fill FFTW input array - for (size_t n = 0; n < N; ++n) { - fft_in[n][0] = c[n]; // Real part - fft_in[n][1] = 0.0; // Imaginary part - } - - // Execute IFFT - fftw_execute(plan); - - // Compute the IDCT using the IFFT output - for (size_t k = 0; k < N; ++k) { - double real = fft_out[k][0]; - double imag = fft_out[k][1]; - x[k] = (real * cos(M_PI * k / (2.0 * N)) + - imag * sin(M_PI * k / (2.0 * N))) / - N; - } - - // Free FFTW resources - fftw_destroy_plan(plan); - fftw_free(fft_in); - fftw_free(fft_out); - - return x; + return result; } // Wrapper function to accept and return NumPy arrays py::array ifdct( - py::array_t in_arr, bool simd) { + py::array_t in_arr) { // Request a buffer from the input array py::buffer_info buf = in_arr.request(); if (buf.ndim != 1) { @@ -105,7 +64,8 @@ py::array ifdct( // Perform IDCT double *result = new double[N]; - ifdct_inner(input_data, result, simd); + ifdct_inner(input_data, result); + // std::fill(result, result + N, 0.0); // Convert result back to NumPy array return py::array_t(N, result); } diff --git a/src/fdct_inner.cpp b/src/fdct_inner.cpp index 4b717dd..71d3ac4 100644 --- a/src/fdct_inner.cpp +++ b/src/fdct_inner.cpp @@ -13,6 +13,7 @@ // Function to compute the Fast Cosine Transform + ComplexArray reorderInput(const ComplexArray &x) { size_t N = x.size(); ComplexArray u(N); @@ -74,7 +75,7 @@ void idctPreprocess(ComplexArray &x) { } -void fct_inner(const ComplexArray &x, double *result, bool simd) { +void fct_inner(const ComplexArray &x, double *result) { size_t N = x.size(); auto fft_in = reorderInput(x); @@ -84,6 +85,7 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) { // Compute the DCT using the FFT output double *x_real = fft_in.get_real_ptr(); double *x_imag = fft_in.get_imag_ptr(); +#pragma omp parallel for for (size_t k = 0; k < N; ++k) { result[k] = (x_real[k] * cos(M_PI * k / (2 * N)) - x_imag[k] * sin(-M_PI * k / (2 * N))) * @@ -91,7 +93,7 @@ void fct_inner(const ComplexArray &x, double *result, bool simd) { } } -void ifdct_inner(const ComplexArray &c, double *result, bool simd) { +void ifdct_inner(const ComplexArray &c, double *result) { size_t N = c.size(); ComplexArray fft_in = c; idctPreprocess(fft_in); diff --git a/src/fdct_inner.hpp b/src/fdct_inner.hpp index 38dd6a3..cdbcd36 100644 --- a/src/fdct_inner.hpp +++ b/src/fdct_inner.hpp @@ -12,6 +12,6 @@ ComplexArray reorderInput(const ComplexArray &x); -void fct_inner(const ComplexArray &x, double *result, bool simd); +void fct_inner(const ComplexArray &x, double *result); -void ifdct_inner(const ComplexArray &c, double *result, bool simd); +void ifdct_inner(const ComplexArray &c, double *result); diff --git a/src/fdct_module.cpp b/src/fdct_module.cpp index 38c5d1e..fa6b56d 100644 --- a/src/fdct_module.cpp +++ b/src/fdct_module.cpp @@ -3,14 +3,25 @@ #include #include "fdct.hpp" #include "dct_naive.hpp" +#include "complex_array.hpp" namespace py = pybind11; #define _USE_MATH_DEFINES #include +void _set_num_thread(size_t n) { + fft::set_num_threads(n); +} + +void _set_simd(bool s) { + fft::set_simd(s); +} + PYBIND11_MODULE(fdct, m) { m.def("dct_naive", &dct_naive, "Compute fast discrete cosine transformation."); m.def("idct_naive", &idct_naive, "Compute inverse fast discrete cosine transformation."); m.def("fdct", &fdct, "Compute fast discrete cosine transformation."); m.def("ifdct", &ifdct, "Compute inverse fast discrete cosine transformation."); + m.def("set_num_threads", 
&_set_num_thread, "Set the number of threads to use."); + m.def("set_simd", _set_simd, "Set whether to use SIMD instructions."); } diff --git a/src/fft.cpp b/src/fft.cpp index 0e8bfb0..bf6f847 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -1,3 +1,10 @@ +#include "fft.hpp" + +#include +#include +#include +#include + #include #include #include @@ -5,152 +12,255 @@ #include #include #include -#include #include "complex_array.hpp" -#include "fft.hpp" static size_t reverseBits(size_t val, int width); +void fft::set_num_threads(size_t n) { omp_set_num_threads(n); } + +void fft::set_simd(bool s) { simd = s; } + static ComplexArray mkExpTable(size_t n, bool inverse) { - ComplexArray expTable(n); - double *real = expTable.get_real_ptr(); - double *imag = expTable.get_imag_ptr(); - for (size_t i = 0; i < n; i++) { - double angle = (inverse ? 2 : -2) * M_PI * i / n; - real[i] = std::cos(angle); - imag[i] = std::sin(angle); - } - return expTable; + ComplexArray expTable(n); + double *real = expTable.get_real_ptr(); + double *imag = expTable.get_imag_ptr(); + for (size_t i = 0; i < n; i++) { + double angle = (inverse ? 2 : -2) * M_PI * i / n; + real[i] = std::cos(angle); + imag[i] = std::sin(angle); + } + return expTable; } -void fft::transformRadix2(ComplexArray &x, bool inverse, bool simd) { - int levels = 0; - size_t n = x.size(); - double *real = x.get_real_ptr(); - double *imag = x.get_imag_ptr(); - - for (size_t temp = n; temp > 1U; temp >>= 1) - levels++; - if (static_cast(1U) << levels != n) - throw std::domain_error("Length is not a power of 2"); - - ComplexArray expTable = mkExpTable(n, inverse); - - - for (size_t i = 0; i < n; i++) { - size_t j = reverseBits(i, levels); - if (j > i) { - std::swap(real[i], real[j]); - std::swap(imag[i], imag[j]); +void fft::transformRadix2(ComplexArray &x, bool inverse) { + int levels = 0; + size_t n = x.size(); + double *real = x.get_real_ptr(); + double *imag = x.get_imag_ptr(); + + for (size_t temp = n; temp > 1U; temp >>= 1) levels++; + if (static_cast(1U) << levels != n) + throw std::domain_error("Length is not a power of 2"); + + ComplexArray expTable = mkExpTable(n, inverse); +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + size_t j = reverseBits(i, levels); + if (j > i) { + std::swap(real[i], real[j]); + std::swap(imag[i], imag[j]); + } } - } - - double *real_expTable = expTable.get_real_ptr(); - double *imag_expTable = expTable.get_imag_ptr(); - - for (size_t size = 2; size <= n; size *= 2) { - size_t halfsize = size / 2; - size_t tablestep = n / size; - for (size_t i = 0; i < n; i += size) { - for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) { - double temp_real = real[j + halfsize] * real_expTable[k] - - imag[j + halfsize] * imag_expTable[k]; - double temp_imag = real[j + halfsize] * imag_expTable[k] + - imag[j + halfsize] * real_expTable[k]; - real[j + halfsize] = real[j] - temp_real; - imag[j + halfsize] = imag[j] - temp_imag; - real[j] += temp_real; - imag[j] += temp_imag; - } + + double *real_expTable = expTable.get_real_ptr(); + double *imag_expTable = expTable.get_imag_ptr(); + + for (size_t size = 2; size <= n; size *= 2) { + size_t halfsize = size / 2; + size_t tablestep = n / size; +#pragma omp parallel for + for (size_t i = 0; i < n; i += size) { + for (size_t j = i, k = 0; j < i + halfsize; j++, k += tablestep) { + double temp_real = real[j + halfsize] * real_expTable[k] - + imag[j + halfsize] * imag_expTable[k]; + double temp_imag = real[j + halfsize] * imag_expTable[k] + + imag[j + halfsize] * 
real_expTable[k]; + real[j + halfsize] = real[j] - temp_real; + imag[j + halfsize] = imag[j] - temp_imag; + real[j] += temp_real; + imag[j] += temp_imag; + } + } + if (size == n) break; } - if (size == n) - break; - } } -void fft::transformBluestein(ComplexArray &x, bool inverse, bool simd) { - size_t n = x.size(); - double *real = x.get_real_ptr(); - double *imag = x.get_imag_ptr(); - size_t m = 1; - while (m / 2 <= n) { - if (m > SIZE_MAX / 2) - throw std::length_error("Vector too large"); - m *= 2; - } - - // Trigonometric table - ComplexArray expTable(n); - double *real_expTable = expTable.get_real_ptr(); - double *imag_expTable = expTable.get_imag_ptr(); - for (size_t i = 0; i < n; i++) { - uintmax_t temp = static_cast(i) * i; - temp %= static_cast(n) * 2; - double angle = (inverse ? M_PI : -M_PI) * temp / n; - expTable[i] = std::polar(1.0, angle); - } - ComplexArray a(m), b(m); - double *real_avec = a.get_real_ptr(); - double *imag_avec = a.get_imag_ptr(); - - - // TODO: Use SIMD - for (size_t i = 0; i < n; i++) { - real_avec[i] = real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; - imag_avec[i] = real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; - // a[i] = Complex(x[i]) * Complex(expTable[i]); - } - - double *real_bvec = b.get_real_ptr(); - double *imag_bvec = b.get_imag_ptr(); - real_bvec[0] = real_expTable[0]; - imag_bvec[0] = imag_expTable[0]; - - for (size_t i = 1; i < n; ++i) { - real_bvec[i] = real_bvec[m - i] = real_expTable[i]; - imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; - } - - convolove(a, b, simd); - // x = c * expTable; - for (size_t i = 0; i < n; i++) { - real[i] = a.get_real_ptr()[i] * real_expTable[i] - a.get_imag_ptr()[i] * imag_expTable[i]; - imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + a.get_imag_ptr()[i] * real_expTable[i]; - } +static inline void compute_mul_exp(size_t n, const double *real, + const double *imag, + const double *real_expTable, + const double *imag_expTable, + double *real_avec, double *imag_avec, + bool simd) { + if (simd) { +// Process in chunks of 4 using AVX2 +#pragma omp parallel for + for (size_t i = 0; i <= n - 4; i += 4) { + // Load 4 elements from each array + __m256d real_vec = _mm256_loadu_pd(&real[i]); + __m256d imag_vec = _mm256_loadu_pd(&imag[i]); + __m256d real_exp_vec = _mm256_loadu_pd(&real_expTable[i]); + __m256d imag_exp_vec = _mm256_loadu_pd(&imag_expTable[i]); + + // Compute real_avec = real * real_expTable - imag * imag_expTable + __m256d real_result = + _mm256_sub_pd(_mm256_mul_pd(real_vec, real_exp_vec), + _mm256_mul_pd(imag_vec, imag_exp_vec)); + + // Compute imag_avec = real * imag_expTable + imag * real_expTable + __m256d imag_result = + _mm256_add_pd(_mm256_mul_pd(real_vec, imag_exp_vec), + _mm256_mul_pd(imag_vec, real_exp_vec)); + + // Store the results back to memory + _mm256_storeu_pd(&real_avec[i], real_result); + _mm256_storeu_pd(&imag_avec[i], imag_result); + } + + // Handle remaining elements (n % 4 != 0) + for (size_t i = n - n % 4; i < n; i++) { + real_avec[i] = + real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; + imag_avec[i] = + real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; + } + } else { +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + real_avec[i] = + real[i] * real_expTable[i] - imag[i] * imag_expTable[i]; + imag_avec[i] = + real[i] * imag_expTable[i] + imag[i] * real_expTable[i]; + } + } } +static inline void setup_a_b(size_t n, size_t m, const double *real_expTable, + const double *imag_expTable, double *real_bvec, + double *imag_bvec, bool 
simd) { + real_bvec[0] = real_expTable[0]; + imag_bvec[0] = imag_expTable[0]; + + if (false) { +#pragma omp parallel for + for (size_t i = 1; i <= n - 4; i += 4) { + __m256d real_exp_vec = _mm256_loadu_pd(&real_expTable[i]); + __m256d imag_exp_vec = _mm256_loadu_pd(&imag_expTable[i]); + + _mm256_storeu_pd(&real_bvec[i], real_exp_vec); + _mm256_storeu_pd( + &real_bvec[m - i - 3], + _mm256_permute4x64_pd(real_exp_vec, _MM_SHUFFLE(0, 1, 2, 3))); + + __m256d neg_imag_exp_vec = + _mm256_sub_pd(_mm256_setzero_pd(), imag_exp_vec); + _mm256_storeu_pd(&imag_bvec[i], neg_imag_exp_vec); + _mm256_storeu_pd(&imag_bvec[m - i - 3], + _mm256_permute4x64_pd(neg_imag_exp_vec, + _MM_SHUFFLE(0, 1, 2, 3))); + } + + // Handle remaining elements (n % 4 != 0) + for (size_t i = n - n % 4; i < n; ++i) { + real_bvec[i] = real_bvec[m - i] = real_expTable[i]; + imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; + } + } else { +#pragma omp for + for (size_t i = 1; i < n; ++i) { + real_bvec[i] = real_bvec[m - i] = real_expTable[i]; + imag_bvec[i] = imag_bvec[m - i] = -imag_expTable[i]; + } + } +} + +void fft::transformBluestein(ComplexArray &x, bool inverse) { + size_t n = x.size(); + double *real = x.get_real_ptr(); + double *imag = x.get_imag_ptr(); + size_t m = 1; + while (m / 2 <= n) { + if (m > SIZE_MAX / 2) throw std::length_error("Vector too large"); + m *= 2; + } + + // Trigonometric table + ComplexArray expTable(n); + double *real_expTable = expTable.get_real_ptr(); + double *imag_expTable = expTable.get_imag_ptr(); +#pragma omp parallel for + for (size_t i = 0; i < n; i++) { + uintmax_t temp = static_cast(i) * i; + temp %= static_cast(n) * 2; + double angle = (inverse ? M_PI : -M_PI) * temp / n; + expTable[i] = std::polar(1.0, angle); + } + ComplexArray a(m), b(m); + double *real_avec = a.get_real_ptr(); + double *imag_avec = a.get_imag_ptr(); + + // TODO: Use SIMD + compute_mul_exp(n, real, imag, real_expTable, imag_expTable, real_avec, + imag_avec, simd); + + double *real_bvec = b.get_real_ptr(); + double *imag_bvec = b.get_imag_ptr(); + setup_a_b(n, m, real_expTable, imag_expTable, real_bvec, imag_bvec, simd); + + convolove(a, b); + // x = c * expTable; + + // for (size_t i = 0; i < n; i++) { + // real[i] = a.get_real_ptr()[i] * real_expTable[i] - + // a.get_imag_ptr()[i] * imag_expTable[i]; + // imag[i] = a.get_real_ptr()[i] * imag_expTable[i] + + // a.get_imag_ptr()[i] * real_expTable[i]; + // } + compute_mul_exp(n, real_avec, imag_avec, real_expTable, imag_expTable, real, + imag, simd); +} // compute the convolution of two complex arrays and set the result to x -void fft::convolove(ComplexArray &_x, const ComplexArray &_y, bool simd) { - size_t n = _x.size(); - if (n != _y.size()) - throw std::domain_error("Mismatched lengths"); - - ComplexArray& x = _x; - ComplexArray y = _y; - - transform(x, false, simd); - transform(y, false, simd); - - x.multiply(y, simd); - transform(x, true, simd); - x.scale(1.0 / n, simd); +void fft::convolove(ComplexArray &_x, const ComplexArray &_y) { + size_t n = _x.size(); + if (n != _y.size()) throw std::domain_error("Mismatched lengths"); + + ComplexArray &x = _x; + ComplexArray y = _y; + + transform(x, false); + transform(y, false); + + x.multiply(y, simd); + transform(x, true); + x.scale(1.0 / n, simd); } -void fft::transform(ComplexArray &x, bool inverse, bool simd) { - size_t n = x.size(); - if (n == 0) - return; - else if ((n & (n - 1)) == 0) // Is power of 2 - transformRadix2(x, inverse, simd); - else // More complicated algorithm for arbitrary sizes - 
transformBluestein(x, inverse, simd); +void fft::transform(ComplexArray &x, bool inverse) { + size_t n = x.size(); + if (n == 0) + return; + else if ((n & (n - 1)) == 0) // Is power of 2 + transformRadix2(x, inverse); + else // More complicated algorithm for arbitrary sizes + transformBluestein(x, inverse); } static size_t reverseBits(size_t val, int width) { - size_t result = 0; - for (int i = 0; i < width; i++, val >>= 1) - result = (result << 1) | (val & 1U); - return result; + auto n = val; +#if __SIZEOF_SIZE_T__ == 8 + n = ((n & 0xffffffff00000000ULL) >> 32) | + ((n & 0x00000000ffffffffULL) << 32); + n = ((n & 0xffff0000ffff0000ULL) >> 16) | + ((n & 0x0000ffff0000ffffULL) << 16); + n = ((n & 0xff00ff00ff00ff00ULL) >> 8) | ((n & 0x00ff00ff00ff00ffULL) << 8); + n = ((n & 0xf0f0f0f0f0f0f0f0ULL) >> 4) | ((n & 0x0f0f0f0f0f0f0f0fULL) << 4); + n = ((n & 0xccccccccccccccccULL) >> 2) | ((n & 0x3333333333333333ULL) << 2); + n = ((n & 0xaaaaaaaaaaaaaaaaULL) >> 1) | ((n & 0x5555555555555555ULL) << 1); + return n >> (64 - width); +#elif __SIZEOF_SIZE_T__ == 4 + n = ((n & 0xffff0000) >> 16) | ((n & 0x0000ffff) << 16); + n = ((n & 0xff00ff00) >> 8) | ((n & 0x00ff00ff) << 8); + n = ((n & 0xf0f0f0f0) >> 4) | ((n & 0x0f0f0f0f) << 4); + n = ((n & 0xcccccccc) >> 2) | ((n & 0x33333333) << 2); + n = ((n & 0xaaaaaaaa) >> 1) | ((n & 0x55555555) << 1); + return n >> (32 - width); +#endif } + +// static size_t reverseBits(size_t val, int width) { +// size_t result = 0; +// for (int i = 0; i < width; i++, val >>= 1) +// result = (result << 1) | (val & 1U); +// return result; +// } \ No newline at end of file diff --git a/src/fft.hpp b/src/fft.hpp index 868eee7..d64a452 100644 --- a/src/fft.hpp +++ b/src/fft.hpp @@ -3,11 +3,19 @@ #include "complex_array.hpp" namespace fft { -void transformRadix2(ComplexArray &x, bool inverse, bool simd = false); -void transformBluestein(ComplexArray &x, bool inverse, bool simd = false); +static bool simd = false; -void transform(ComplexArray &x, bool inverse, bool simd = false); +void transformRadix2(ComplexArray &x, bool inverse); + +void transformBluestein(ComplexArray &x, bool inverse); + +void transform(ComplexArray &x, bool inverse); + +void convolove(ComplexArray &x, const ComplexArray &y); + +void set_num_threads(size_t n); + +void set_simd(bool s); -void convolove(ComplexArray &x, const ComplexArray &y, bool simd = false); } // namespace fft diff --git a/test_dct.py b/test_dct.py index 092c6f7..d67a68a 100644 --- a/test_dct.py +++ b/test_dct.py @@ -3,26 +3,62 @@ import fdct import scipy.fft as fft import time +import os # Parameters -test_size_seq = [10, 64, 127, 256, 1023, 2048, 4097, 8192, 8193] -tolerance = 1e-6 # Tolerance for individual element comparison -simd = [False, True] # SIMD flag +test_size_seq_power_of_2 = [65536, 131072, 262144, 524288, 1048576, 2097152] +test_size_seq_odd = [x + 1 for x in test_size_seq_power_of_2] + +tolerance = 1e-6 # Tolerance for element comparison +num_threads = [1, 2, 4, 8] +simd = [True, False] + +# Combined parameter set: (size, is_power_of_two) +combined_sizes = [(size, True) for size in test_size_seq_power_of_2] + \ + [(size, False) for size in test_size_seq_odd] # List of test functions and their corresponding references test_cases = [ - ("dct_naive", fdct.dct_naive, fft.dct), - ("idct_naive", fdct.idct_naive, fft.idct), + # ("dct_naive", fdct.dct_naive, fft.dct), + # ("idct_naive", fdct.idct_naive, fft.idct), ("fdct", fdct.fdct, fft.dct), ("ifdct", fdct.ifdct, fft.idct), ] -@pytest.mark.parametrize("size", test_size_seq) +def 
write_csv_result(filename, header, row): + """Write a single row of results to a CSV file, adding a header if the file doesn't exist.""" + file_exists = os.path.isfile(filename) + with open(filename, "a", newline="") as f: + if not file_exists: + f.write(header + "\n") + f.write(row + "\n") + +@pytest.mark.parametrize("size,is_power_of_two", combined_sizes) @pytest.mark.parametrize("name, func, ref_func", test_cases) +@pytest.mark.parametrize("num_thread", num_threads) @pytest.mark.parametrize("simd", simd) -def test_transform(name, func, ref_func, size, simd): +def test_transform(name, func, ref_func, size, is_power_of_two, num_thread, simd): + # Generate input data inp = np.random.rand(size) + + # Set number of threads for fdct + fdct.set_num_threads(num_thread) + + fdct.set_simd(simd) + + # Perform reference transform (ground truth) gt = ref_func(inp) - ans = func(inp, simd) + + # Measure execution time + start_time = time.time() + ans = func(inp) + duration = time.time() - start_time + + # Verify correctness np.testing.assert_allclose(ans, gt, atol=tolerance, rtol=0) + # Write results to a single CSV file + filename = "combined_results.csv" + header = "function_name,size,num_threads,simd,is_power_of_two,duration_s" + row = f"{name},{size},{num_thread},{simd},{is_power_of_two},{duration}" + write_csv_result(filename, header, row) \ No newline at end of file
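
For readers who want to exercise the new runtime controls outside of pytest, the following sketch shows one way the module introduced by this diff could be driven. It assumes the extension has already been built and is importable as fdct, and that SciPy is available for a reference result, mirroring test_dct.py; the array sizes are illustrative.

import numpy as np
import scipy.fft as fft
import fdct

# Configure the runtime knobs added in src/fdct_module.cpp.
fdct.set_num_threads(4)   # forwarded to omp_set_num_threads in src/fft.cpp
fdct.set_simd(True)       # enables the AVX2 code paths in the ComplexArray/Bluestein helpers

x = np.random.rand(1 << 16)          # power-of-two length -> radix-2 FFT
y = np.random.rand((1 << 16) + 1)    # odd length -> Bluestein's algorithm

# Forward and inverse transforms, checked against SciPy as in test_dct.py.
np.testing.assert_allclose(fdct.fdct(x), fft.dct(x), atol=1e-6, rtol=0)
np.testing.assert_allclose(fdct.ifdct(y), fft.idct(y), atol=1e-6, rtol=0)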
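
The test appends one row per run to combined_results.csv. A small post-processing sketch, assuming pandas is installed (it is not a dependency declared anywhere in this diff), could summarize the recorded timings by thread count and SIMD setting:

import pandas as pd

# Column names come from the header written by write_csv_result in test_dct.py:
# function_name,size,num_threads,simd,is_power_of_two,duration_s
df = pd.read_csv("combined_results.csv")

# Median duration per configuration; rows accumulate across repeated pytest
# runs because the CSV is opened in append mode.
summary = (
    df.groupby(["function_name", "size", "num_threads", "simd"])["duration_s"]
      .median()
      .unstack("num_threads")
)
print(summary)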