From d1f676c56b000194829831627c3edbcee78b8882 Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sun, 21 Dec 2025 04:21:03 +0100
Subject: [PATCH 1/8] wip

---
 include/CppCore/Math/Util.h |  52 ++++++++++++++++
 src/CppCore.Debug/main.cpp  | 115 ++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)
diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index c1aaf0a1..7f8bc960 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3812,6 +3812,58 @@ namespace CppCore
       CppCore::upowmod(a, b, m, r, t);
    }
 
+   template<typename UINT>
+   INLINE static void upowmod_kary(UINT base, UINT exp, const UINT& mod, UINT& result, uint32_t k = 4U)
+   {
+      if (mod == 1) { CppCore::clear(result); return; }
+      if (exp == 0) { result = 1; return; }
+
+      base %= mod;
+
+      // Precompute powers: base^0, base^1, ..., base^(2^k - 1)
+      uint32_t table_size = 1 << k;
+      std::vector<UINT> powers(table_size);
+      powers[0] = 1;
+      powers[1] = base;
+
+      for (uint32_t i = 2; i < table_size; i++) {
+         CppCore::umulmod(powers[i-1], base, mod, powers[i]);
+      }
+
+      // Find the position of the highest bit in exp
+      constexpr auto NUMBITS = sizeof(UINT) * 8U;
+      const auto LZB = CppCore::lzcnt(exp);
+      const auto bits = NUMBITS - LZB;
+
+      // Round up to multiple of k
+      const int total_bits = ((bits + k - 1) / k) * k;
+      //int total_bits = CppCore::rup32(bits, k);
+      //int total_bits = CppCore::rupptwo32(bits, k);
+
+      const uint32_t MASK = ((1 << k) - 1);
+
+      result = 1;
+
+      // Process k bits at a time from left to right (MSB to LSB)
+      for (int pos = total_bits - k; pos >= 0; pos -= k) {
+         // Square result k times (except for the first iteration)
+         if (pos < total_bits - k) {
+            for (int i = 0; i < k; i++) {
+               CppCore::umulmod(result, result, mod, result);
+               //result = (__uint128_t)result * result % mod;
+            }
+         }
+
+         // Extract k-bit chunk at position pos
+         int chunk = (exp >> pos) & MASK;
+
+         // Multiply by base^chunk
+         //result = (__uint128_t)result * powers[chunk] % mod;
+         CppCore::umulmod(result, powers[chunk], mod, result);
+      }
+      //return result;
+   }
+
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // MERSENNE NUMBERS (2^i)-1
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/CppCore.Debug/main.cpp b/src/CppCore.Debug/main.cpp
index 1277fce3..f6f75861 100644
--- a/src/CppCore.Debug/main.cpp
+++ b/src/CppCore.Debug/main.cpp
@@ -1,8 +1,123 @@
 #include <CppCore/Root.h>
+#include <CppCore/Math/Util.h>
+#include <CppCore/Random.h>
+#include <CppCore/Math/BigInt.h>
+#include <CppCore/Math/Primes.h>
+#include <CppCore/Stopwatch.h>
 
 using namespace CppCore;
 
+typedef uint64_t UINTX;
+
+#define N 1024*1024
+
+CppCore::Random::Default64 prng;
+
+/*UINTX d;
+UINTX a[N];
+UINTX r1[N];
+UINTX r2[N];
+
+UINTX bc[2];
+
+uint64_t NOINLINE test1()
+{
+   barrett_constant(bc, d);
+
+   uint64_t t = __rdtsc();
+
+   for (size_t i = 0; i < N; i++)
+      barrett_umod(r1[i], a[i], bc, d);
+
+   return __rdtsc() - t;
+}
+
+uint64_t NOINLINE test2()
+{
+   uint64_t t = __rdtsc();
+
+   for (size_t i = 0; i < N; i++)
+      CppCore::umod(r2[i], a[i], d);
+
+   return __rdtsc() - t;
+}
+
 int main()
 {
+   prng.fill(a);
+   prng.fill(d);
+
+   for (size_t i = 0; i < N; i++)
+      ((uint8_t*)&a[i])[sizeof(UINTX) - 1] |= 0x80;
+
+   uint64_t t2 = test2();
+   uint64_t t1 = test1();
+
+   std::cout << "BARRETT: " << t1 << std::endl;
+   std::cout << "KNUTH:   " << t2 << std::endl;
+
+   for (size_t i = 0; i < N; i++)
+   {
+      if (r1[i] != r2[i])
+      {
+         std::cout << "FAIL" << std::endl;
+         //std::cout << std::hex << r2.toHexString() << std::endl;
+         return 1;
+      }
+   }
+
+   return 0;
+}*/
+
+UINTX a[N];
+UINTX b[N];
+UINTX m[N];
+UINTX r1[N];
+UINTX r2[N];
+
+int main()
+{
+   uint64_t ta = 245373463472;
+   uint64_t tb = 3252764873;
+   uint64_t tm = 32534684;
+   uint64_t tr;
+   CppCore::upowmod_kary(ta, tb, tm, tr);
+   std::cout << tr << std::endl;
+
+   CppCore::upowmod(ta, tb, tm, tr);
+   std::cout << tr << std::endl;
+
+
+   prng.fill(a);
+   prng.fill(b);
+   prng.fill(m);
+
+   for (size_t i = 0; i < N; i++)
+   {
+      UINTX at;
+      CppCore::clone(at, a[i]);
+      CppCore::upowmod(at, b[i], m[i], r1[i]);
+      CppCore::upowmod_kary(a[i], b[i], m[i], r2[i]);
+
+      if (!CppCore::equal(r1[i], r2[i])) {
+         std::cout << a[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
+         std::cout << a[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
+         return 1;
+      }
+   }
    return 0;
 }
+
+
+
+
+//std::cout << std::setfill('0') << std::setw(5) << std::setprecision(10);
+
+/*double t = 0.5;
+double z;
+for (size_t i = 0; i < Primes::NUMODDPRIMES; i++)
+{
+   z = (1.0 - 1.0 / (double)Primes::ODDPRIMES[i]);
+   t *= z;
+   std::cout << std::setfill('0') << std::setw(5) << i << "  " << Primes::ODDPRIMES[i] << "  " << std::fixed << std::setprecision(10) << (1.0 - t) << std::endl;
+}*/
\ No newline at end of file

From 3cb7743ab39cb7110107ff813884d3840d80ba76 Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sat, 3 Jan 2026 15:56:39 +0100
Subject: [PATCH 2/8] wip

---
 include/CppCore/Math/Util.h | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index f880745f..2922b9a7 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3855,55 +3855,50 @@ namespace CppCore
    }
 
    template<typename UINT>
-   INLINE static void upowmod_kary(UINT base, UINT exp, const UINT& mod, UINT& result, uint32_t k = 4U)
+   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& result, uint32_t k = 4U)
    {
-      if (mod == 1) { CppCore::clear(result); return; }
-      if (exp == 0) { result = 1; return; }
+      //assert((&a != &r) && (&b != &r) && (&m != &r));
+      assert(!CppCore::testzero(m));
+      CppCore::clear(result);
+      if (m == 1U)
+         return; // n % 1 = 0
+      *(uint32_t*)&result = 1U;
+      if (CppCore::testzero(exp))
+         return; // 1 % n = 1
 
-      base %= mod;
+      base %= m;
 
       // Precompute powers: base^0, base^1, ..., base^(2^k - 1)
       uint32_t table_size = 1 << k;
       std::vector<UINT> powers(table_size);
       powers[0] = 1;
       powers[1] = base;
-
       for (uint32_t i = 2; i < table_size; i++) {
-         CppCore::umulmod(powers[i-1], base, mod, powers[i]);
+         CppCore::umulmod(powers[i-1], base, m, powers[i]);
       }
 
       // Find the position of the highest bit in exp
       constexpr auto NUMBITS = sizeof(UINT) * 8U;
       const auto LZB = CppCore::lzcnt(exp);
       const auto bits = NUMBITS - LZB;
-
       // Round up to multiple of k
       const int total_bits = ((bits + k - 1) / k) * k;
-      //int total_bits = CppCore::rup32(bits, k);
-      //int total_bits = CppCore::rupptwo32(bits, k);
-
-      const uint32_t MASK = ((1 << k) - 1);
-
-      result = 1;
 
       // Process k bits at a time from left to right (MSB to LSB)
       for (int pos = total_bits - k; pos >= 0; pos -= k) {
+
          // Square result k times (except for the first iteration)
          if (pos < total_bits - k) {
             for (int i = 0; i < k; i++) {
-               CppCore::umulmod(result, result, mod, result);
-               //result = (__uint128_t)result * result % mod;
+               CppCore::umulmod(result, result, m, result);
             }
          }
 
          // Extract k-bit chunk at position pos
-         int chunk = (exp >> pos) & MASK;
-
+         const uint32_t chunk = CppCore::getbits32(exp, pos, k);
          // Multiply by base^chunk
-         //result = (__uint128_t)result * powers[chunk] % mod;
-         CppCore::umulmod(result, powers[chunk], mod, result);
+         CppCore::umulmod(result, powers[chunk], m, result);
       }
-      //return result;
    }
 
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

From ac159c7383816d8f5bf2f3b20de5359bf9711078 Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sat, 3 Jan 2026 16:32:08 +0100
Subject: [PATCH 3/8] wip

---
 include/CppCore/Math/Util.h | 40 ++++++++++++++++--------------------
 src/CppCore.Debug/main.cpp  | 41 +++++++++++++++++++++++++------------
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index 2922b9a7..25ba2826 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3854,50 +3854,46 @@ namespace CppCore
       CppCore::upowmod(a, b, m, r, t);
    }
 
-   template<typename UINT>
-   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& result, uint32_t k = 4U)
+   template<typename UINT, uint32_t K = 4U>
+   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& r)
    {
       //assert((&a != &r) && (&b != &r) && (&m != &r));
       assert(!CppCore::testzero(m));
-      CppCore::clear(result);
+      CppCore::clear(r);
       if (m == 1U)
          return; // n % 1 = 0
-      *(uint32_t*)&result = 1U;
+      *(uint32_t*)&r = 1U;
       if (CppCore::testzero(exp))
          return; // 1 % n = 1
 
-      base %= m;
+      base %= m;//TODO
 
       // Precompute powers: base^0, base^1, ..., base^(2^k - 1)
-      uint32_t table_size = 1 << k;
-      std::vector<UINT> powers(table_size);
-      powers[0] = 1;
+      constexpr size_t TABLE_SIZE = 1U << K;
+      UINT powers[TABLE_SIZE];
+      powers[0] = 1U;
       powers[1] = base;
-      for (uint32_t i = 2; i < table_size; i++) {
+      for (size_t i = 2; i < TABLE_SIZE; i++)
          CppCore::umulmod(powers[i-1], base, m, powers[i]);
-      }
 
       // Find the position of the highest bit in exp
+      // Round up to multiple of k
       constexpr auto NUMBITS = sizeof(UINT) * 8U;
       const auto LZB = CppCore::lzcnt(exp);
-      const auto bits = NUMBITS - LZB;
-      // Round up to multiple of k
-      const int total_bits = ((bits + k - 1) / k) * k;
+      const auto HIDX = NUMBITS - LZB;
+      const auto HIDX_K = ((HIDX + K - 1U) / K) * K;
 
       // Process k bits at a time from left to right (MSB to LSB)
-      for (int pos = total_bits - k; pos >= 0; pos -= k) {
+      for (int pos = HIDX_K - K; pos >= 0; pos -= K) {
 
          // Square result k times (except for the first iteration)
-         if (pos < total_bits - k) {
-            for (int i = 0; i < k; i++) {
-               CppCore::umulmod(result, result, m, result);
+         if (pos < HIDX_K - K) {
+            for (size_t i = 0; i < K; i++) {
+               CppCore::umulmod(r, r, m, r);
             }
          }
-
-         // Extract k-bit chunk at position pos
-         const uint32_t chunk = CppCore::getbits32(exp, pos, k);
-         // Multiply by base^chunk
-         CppCore::umulmod(result, powers[chunk], m, result);
+         const uint32_t CHUNK = CppCore::getbits32(exp, pos, K);
+         CppCore::umulmod(r, powers[CHUNK], m, r);
       }
    }
 
diff --git a/src/CppCore.Debug/main.cpp b/src/CppCore.Debug/main.cpp
index f6f75861..87743f39 100644
--- a/src/CppCore.Debug/main.cpp
+++ b/src/CppCore.Debug/main.cpp
@@ -7,9 +7,9 @@
 
 using namespace CppCore;
 
-typedef uint64_t UINTX;
+typedef uint1024_t UINTX;
 
-#define N 1024*1024
+#define N 1024
 
 CppCore::Random::Default64 prng;
 
@@ -69,15 +69,17 @@ int main()
    return 0;
 }*/
 
-UINTX a[N];
+UINTX a1[N];
+UINTX a2[N];
 UINTX b[N];
 UINTX m[N];
 UINTX r1[N];
 UINTX r2[N];
+//UINTX at;
 
 int main()
 {
-   uint64_t ta = 245373463472;
+   /*uint64_t ta = 245373463472;
    uint64_t tb = 3252764873;
    uint64_t tm = 32534684;
    uint64_t tr;
@@ -85,26 +87,39 @@ int main()
    std::cout << tr << std::endl;
 
    CppCore::upowmod(ta, tb, tm, tr);
-   std::cout << tr << std::endl;
+   std::cout << tr << std::endl;*/
 
 
-   prng.fill(a);
+
+   prng.fill(a1);
    prng.fill(b);
    prng.fill(m);
+   CppCore::clone(a2, a1);
 
+   uint64_t t = __rdtsc();
    for (size_t i = 0; i < N; i++)
    {
-      UINTX at;
-      CppCore::clone(at, a[i]);
-      CppCore::upowmod(at, b[i], m[i], r1[i]);
-      CppCore::upowmod_kary(a[i], b[i], m[i], r2[i]);
-
+      /*if (i % 1024 == 0) {
+         std::cout << i << std::endl;
+      }*/
+      CppCore::upowmod(a1[i], b[i], m[i], r1[i]);
+      CppCore::upowmod_kary(a2[i], b[i], m[i], r2[i]);
       if (!CppCore::equal(r1[i], r2[i])) {
-         std::cout << a[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
-         std::cout << a[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
+         std::cout << a1[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
+         std::cout << a2[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
          return 1;
       }
    }
+   t = __rdtsc() - t;
+   std::cout << t << std::endl;
+
+   UINTX r = 0;
+   for (size_t i = 0; i < N; i++)
+   {
+      r += r2[i];
+   }
+   std::cout << r << std::endl;
+
    return 0;
 }
 

From 6848e60f74653061e6d616c0a0b870c96d66375a Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sat, 3 Jan 2026 17:03:00 +0100
Subject: [PATCH 4/8] wip

---
 include/CppCore/Math/Util.h | 3 ++-
 src/CppCore.Debug/main.cpp  | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index 25ba2826..87b4625f 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3892,7 +3892,8 @@ namespace CppCore
                CppCore::umulmod(r, r, m, r);
             }
          }
-         const uint32_t CHUNK = CppCore::getbits32(exp, pos, K);
+         const uint32_t N = MIN(K, NUMBITS-pos);
+         const uint32_t CHUNK = CppCore::getbits32(exp, pos, N);
          CppCore::umulmod(r, powers[CHUNK], m, r);
       }
    }
diff --git a/src/CppCore.Debug/main.cpp b/src/CppCore.Debug/main.cpp
index 87743f39..812f7ce0 100644
--- a/src/CppCore.Debug/main.cpp
+++ b/src/CppCore.Debug/main.cpp
@@ -7,7 +7,7 @@
 
 using namespace CppCore;
 
-typedef uint1024_t UINTX;
+typedef uint2048_t UINTX;
 
 #define N 1024
 
@@ -103,7 +103,7 @@ int main()
          std::cout << i << std::endl;
       }*/
       CppCore::upowmod(a1[i], b[i], m[i], r1[i]);
-      CppCore::upowmod_kary(a2[i], b[i], m[i], r2[i]);
+      CppCore::upowmod_kary<UINTX, 5U>(a2[i], b[i], m[i], r2[i]);
       if (!CppCore::equal(r1[i], r2[i])) {
          std::cout << a1[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
          std::cout << a2[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
@@ -116,7 +116,7 @@ int main()
    UINTX r = 0;
    for (size_t i = 0; i < N; i++)
    {
-      r += r2[i];
+      r += r1[i];
    }
    std::cout << r << std::endl;
 

From b23581dc503eb85690d23351b27ea55062448c0d Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Fri, 9 Jan 2026 23:34:33 +0100
Subject: [PATCH 5/8] wip

---
 include/CppCore/Math/Util.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index 87b4625f..6ea83994 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3854,8 +3854,11 @@ namespace CppCore
       CppCore::upowmod(a, b, m, r, t);
    }
 
+   /// <summary>
+   /// a^b mod m
+   /// </summary>
    template<typename UINT, uint32_t K = 4U>
-   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& r)
+   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& r, UINT t[3])
    {
       //assert((&a != &r) && (&b != &r) && (&m != &r));
       assert(!CppCore::testzero(m));
@@ -3866,15 +3869,16 @@ namespace CppCore
       if (CppCore::testzero(exp))
          return; // 1 % n = 1
 
-      base %= m;//TODO
+      //base %= m;//TODO
 
       // Precompute powers: base^0, base^1, ..., base^(2^k - 1)
       constexpr size_t TABLE_SIZE = 1U << K;
       UINT powers[TABLE_SIZE];
-      powers[0] = 1U;
-      powers[1] = base;
+      CppCore::clear(powers[0]);
+      *(uint32_t*)&powers[0] = 1U;
+      CppCore::clone(powers[1], base);
       for (size_t i = 2; i < TABLE_SIZE; i++)
-         CppCore::umulmod(powers[i-1], base, m, powers[i]);
+         CppCore::umulmod(powers[i-1], base, m, powers[i], t);
 
       // Find the position of the highest bit in exp
       // Round up to multiple of k
@@ -3889,15 +3893,25 @@ namespace CppCore
          // Square result k times (except for the first iteration)
          if (pos < HIDX_K - K) {
             for (size_t i = 0; i < K; i++) {
-               CppCore::umulmod(r, r, m, r);
+               CppCore::umulmod(r, r, m, r, t);
             }
          }
          const uint32_t N = MIN(K, NUMBITS-pos);
          const uint32_t CHUNK = CppCore::getbits32(exp, pos, N);
-         CppCore::umulmod(r, powers[CHUNK], m, r);
+         CppCore::umulmod(r, powers[CHUNK], m, r, t);
       }
    }
 
+   /// <summary>
+   /// a^b mod m
+   /// </summary>
+   template<typename UINT, uint32_t K = 4U>
+   INLINE static void upowmod_kary(UINT& a, const UINT& b, const UINT& m, UINT& r)
+   {
+      CPPCORE_ALIGN_OPTIM(UINT) t[3];
+      CppCore::upowmod_kary<UINT, K>(a, b, m, r, t);
+   }
+
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // MERSENNE NUMBERS (2^i)-1
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

From 4258da55e3bada0f3caa61403b9e1dbade0f286b Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sat, 10 Jan 2026 21:24:57 +0100
Subject: [PATCH 6/8] wip

---
 include/CppCore.Test/Math/Util.h | 21 ++++++++++++++
 include/CppCore/Math/Primes.h    |  2 +-
 include/CppCore/Math/Util.h      | 47 ++++++++++++++------------------
 src/CppCore.Debug/main.cpp       | 22 +++++++++++----
 src/CppCore.Test/Test.cpp        |  2 ++
 5 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/include/CppCore.Test/Math/Util.h b/include/CppCore.Test/Math/Util.h
index ab258fb3..112e4362 100644
--- a/include/CppCore.Test/Math/Util.h
+++ b/include/CppCore.Test/Math/Util.h
@@ -894,6 +894,24 @@ namespace CppCore { namespace Test { namespace Math
          return true;
       }
 
+      template<size_t N64=2>
+      INLINE static bool upowmod()
+      {
+         uint64_t a1[N64]; uint64_t b1[N64]; uint64_t m1[N64]; uint64_t r1[N64];
+         uint64_t a2[N64]; uint64_t b2[N64]; uint64_t m2[N64]; uint64_t r2[N64];
+         CppCore::Random::Default64 prng;
+         for (size_t i = 0; i < 100; i++) {
+            prng.fill(a1); CppCore::clone(a2, a1);
+            prng.fill(b1); CppCore::clone(b2, b1);
+            prng.fill(m1); CppCore::clone(m2, m1);
+            CppCore::upowmod_single(a1, b1, m1, r1);
+            CppCore::upowmod(a2, b2, m2, r2);
+            if (!CppCore::equal(r1, r2))
+               return false;
+         }
+         return true;
+      }
+
       INLINE static bool upow32()
       {
          for (uint32_t base = 0; base < 20; base++)
@@ -1553,6 +1571,9 @@ namespace CppCore { namespace Test { namespace VS { namespace Math {
       TEST_METHOD(UMULMOD64)        { Assert::AreEqual(true, CppCore::Test::Math::Util::umulmod64()); }
       TEST_METHOD(UPOWMOD32)        { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod32()); }
       TEST_METHOD(UPOWMOD64)        { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod64()); }
+      TEST_METHOD(UPOWMOD128)       { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod<2>()); }
+      TEST_METHOD(UPOWMOD256)       { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod<4>()); }
+
       TEST_METHOD(UPOW32)           { Assert::AreEqual(true, CppCore::Test::Math::Util::upow32()); }
       TEST_METHOD(UPOW64)           { Assert::AreEqual(true, CppCore::Test::Math::Util::upow64()); }
       TEST_METHOD(UDIVMOD32)        { Assert::AreEqual(true, CppCore::Test::Math::Util::udivmod32()); }
diff --git a/include/CppCore/Math/Primes.h b/include/CppCore/Math/Primes.h
index 14e9c460..98df104c 100644
--- a/include/CppCore/Math/Primes.h
+++ b/include/CppCore/Math/Primes.h
@@ -122,7 +122,7 @@ namespace CppCore
       /// SPRP with precalculated t, s and d and work memory m
       /// </summary>
       template<typename UINT>
-      INLINE static bool sprp(const UINT& n, UINT& a, const UINT& t, const uint32_t& s, const UINT& d, UINT& r, UINT m[3])
+      INLINE static bool sprp(const UINT& n, const UINT& a, const UINT& t, const uint32_t& s, const UINT& d, UINT& r, UINT m[3])
       {
          assert(a != 1U);
          CppCore::upowmod(a, d, n, r, m);
diff --git a/include/CppCore/Math/Util.h b/include/CppCore/Math/Util.h
index 6ea83994..d6ee6b06 100644
--- a/include/CppCore/Math/Util.h
+++ b/include/CppCore/Math/Util.h
@@ -3822,7 +3822,7 @@ namespace CppCore
    /// a^b mod m
    /// </summary>
    template<typename UINT>
-   INLINE static void upowmod(UINT& a, const UINT& b, const UINT& m, UINT& r, UINT t[3])
+   INLINE static void upowmod_single(UINT& a, const UINT& b, const UINT& m, UINT& r, UINT t[3])
    {
       assert((&a != &r) && (&b != &r) && (&m != &r));
       assert(!CppCore::testzero(m));
@@ -3848,56 +3848,51 @@ namespace CppCore
    /// a^b mod m
    /// </summary>
    template<typename UINT>
-   INLINE static void upowmod(UINT& a, const UINT& b, const UINT& m, UINT& r)
+   INLINE static void upowmod_single(UINT& a, const UINT& b, const UINT& m, UINT& r)
    {
       CPPCORE_ALIGN_OPTIM(UINT) t[3];
-      CppCore::upowmod(a, b, m, r, t);
+      CppCore::upowmod_single(a, b, m, r, t);
    }
 
    /// <summary>
    /// a^b mod m
    /// </summary>
    template<typename UINT, uint32_t K = 4U>
-   INLINE static void upowmod_kary(UINT& base, const UINT& exp, const UINT& m, UINT& r, UINT t[3])
+   INLINE static void upowmod(const UINT& a, const UINT& b, const UINT& m, UINT& r, UINT t[3])
    {
-      //assert((&a != &r) && (&b != &r) && (&m != &r));
+      assert((&a != &r) && (&b != &r) && (&m != &r));
       assert(!CppCore::testzero(m));
       CppCore::clear(r);
-      if (m == 1U)
-         return; // n % 1 = 0
+      constexpr auto NUMBITS = sizeof(UINT)*8U;
+      const auto LZB = CppCore::lzcnt(b);
+      if (LZB == NUMBITS) CPPCORE_UNLIKELY {
+         if (NUMBITS-CppCore::lzcnt(m) != 1U) CPPCORE_LIKELY
+            *(uint32_t*)&r = 1U;
+         return;
+      }
       *(uint32_t*)&r = 1U;
-      if (CppCore::testzero(exp))
-         return; // 1 % n = 1
-
-      //base %= m;//TODO
 
       // Precompute powers: base^0, base^1, ..., base^(2^k - 1)
       constexpr size_t TABLE_SIZE = 1U << K;
-      UINT powers[TABLE_SIZE];
-      CppCore::clear(powers[0]);
-      *(uint32_t*)&powers[0] = 1U;
-      CppCore::clone(powers[1], base);
+      CPPCORE_ALIGN_OPTIM(UINT) powers[TABLE_SIZE];
+      CppCore::clear(powers[0]); *(uint32_t*)&powers[0] = 1U;
+      CppCore::clone(powers[1], a);
       for (size_t i = 2; i < TABLE_SIZE; i++)
-         CppCore::umulmod(powers[i-1], base, m, powers[i], t);
+         CppCore::umulmod(powers[i-1], a, m, powers[i], t);
 
       // Find the position of the highest bit in exp
       // Round up to multiple of k
-      constexpr auto NUMBITS = sizeof(UINT) * 8U;
-      const auto LZB = CppCore::lzcnt(exp);
       const auto HIDX = NUMBITS - LZB;
       const auto HIDX_K = ((HIDX + K - 1U) / K) * K;
 
       // Process k bits at a time from left to right (MSB to LSB)
       for (int pos = HIDX_K - K; pos >= 0; pos -= K) {
 
-         // Square result k times (except for the first iteration)
-         if (pos < HIDX_K - K) {
-            for (size_t i = 0; i < K; i++) {
+         if (pos < HIDX_K - K)
+            for (size_t i = 0; i < K; i++)
                CppCore::umulmod(r, r, m, r, t);
-            }
-         }
          const uint32_t N = MIN(K, NUMBITS-pos);
-         const uint32_t CHUNK = CppCore::getbits32(exp, pos, N);
+         const uint32_t CHUNK = CppCore::getbits32(b, pos, N);
          CppCore::umulmod(r, powers[CHUNK], m, r, t);
       }
    }
@@ -3906,10 +3901,10 @@ namespace CppCore
    /// a^b mod m
    /// </summary>
    template<typename UINT, uint32_t K = 4U>
-   INLINE static void upowmod_kary(UINT& a, const UINT& b, const UINT& m, UINT& r)
+   INLINE static void upowmod(const UINT& a, const UINT& b, const UINT& m, UINT& r)
    {
       CPPCORE_ALIGN_OPTIM(UINT) t[3];
-      CppCore::upowmod_kary<UINT, K>(a, b, m, r, t);
+      CppCore::upowmod<UINT, K>(a, b, m, r, t);
    }
 
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/CppCore.Debug/main.cpp b/src/CppCore.Debug/main.cpp
index 812f7ce0..4ba9c2b0 100644
--- a/src/CppCore.Debug/main.cpp
+++ b/src/CppCore.Debug/main.cpp
@@ -2,7 +2,7 @@
 #include <CppCore/Math/Util.h>
 #include <CppCore/Random.h>
 #include <CppCore/Math/BigInt.h>
-#include <CppCore/Math/Primes.h>
+//#include <CppCore/Math/Primes.h>
 #include <CppCore/Stopwatch.h>
 
 using namespace CppCore;
@@ -90,10 +90,20 @@ int main()
    std::cout << tr << std::endl;*/
 
 
+   /*CppCore::clear(a1);
+   CppCore::clear(b);
+   CppCore::clear(m);
+   for (size_t i = 0; i < N; i++)
+   {
+      *(uint32_t*)&a1[i] = (uint32_t)prng.next();
+      *(uint32_t*)&b[i] = (uint32_t)prng.next();
+      *(uint32_t*)&m[i] = (uint32_t)prng.next();
+   }*/
 
    prng.fill(a1);
    prng.fill(b);
    prng.fill(m);
+
    CppCore::clone(a2, a1);
 
    uint64_t t = __rdtsc();
@@ -102,13 +112,13 @@ int main()
       /*if (i % 1024 == 0) {
          std::cout << i << std::endl;
       }*/
-      CppCore::upowmod(a1[i], b[i], m[i], r1[i]);
-      CppCore::upowmod_kary<UINTX, 5U>(a2[i], b[i], m[i], r2[i]);
-      if (!CppCore::equal(r1[i], r2[i])) {
+      //CppCore::upowmod_single(a1[i], b[i], m[i], r1[i]);
+      CppCore::upowmod<UINTX, 4U>(a2[i], b[i], m[i], r2[i]);
+      /*if (!CppCore::equal(r1[i], r2[i])) {
          std::cout << a1[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
          std::cout << a2[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
          return 1;
-      }
+      }*/
    }
    t = __rdtsc() - t;
    std::cout << t << std::endl;
@@ -116,7 +126,7 @@ int main()
    UINTX r = 0;
    for (size_t i = 0; i < N; i++)
    {
-      r += r1[i];
+      r += r2[i];
    }
    std::cout << r << std::endl;
 
diff --git a/src/CppCore.Test/Test.cpp b/src/CppCore.Test/Test.cpp
index 8ea3e87c..2d3c88e7 100644
--- a/src/CppCore.Test/Test.cpp
+++ b/src/CppCore.Test/Test.cpp
@@ -378,6 +378,8 @@ int main()
    TEST(CppCore::Test::Math::Util::umulmod64,       "umulmod64:        ", std::endl);
    TEST(CppCore::Test::Math::Util::upowmod32,       "upowmod32:        ", std::endl);
    TEST(CppCore::Test::Math::Util::upowmod64,       "upowmod64:        ", std::endl);
+   TEST(CppCore::Test::Math::Util::upowmod<2>,      "upowmod128:       ", std::endl);
+   TEST(CppCore::Test::Math::Util::upowmod<4>,      "upowmod256:       ", std::endl);
    TEST(CppCore::Test::Math::Util::upow32,          "upow32:           ", std::endl);
    TEST(CppCore::Test::Math::Util::upow64,          "upow64:           ", std::endl);
    TEST(CppCore::Test::Math::Util::udivmod32,       "udivmod32:        ", std::endl);

From 30c366b448c0f81df6a878870fd27a99cf45c9c3 Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sun, 11 Jan 2026 15:18:20 +0100
Subject: [PATCH 7/8] wip

---
 src/CppCore.Debug/main.cpp | 140 -------------------------------------
 1 file changed, 140 deletions(-)

diff --git a/src/CppCore.Debug/main.cpp b/src/CppCore.Debug/main.cpp
index 4ba9c2b0..1277fce3 100644
--- a/src/CppCore.Debug/main.cpp
+++ b/src/CppCore.Debug/main.cpp
@@ -1,148 +1,8 @@
 #include <CppCore/Root.h>
-#include <CppCore/Math/Util.h>
-#include <CppCore/Random.h>
-#include <CppCore/Math/BigInt.h>
-//#include <CppCore/Math/Primes.h>
-#include <CppCore/Stopwatch.h>
 
 using namespace CppCore;
 
-typedef uint2048_t UINTX;
-
-#define N 1024
-
-CppCore::Random::Default64 prng;
-
-/*UINTX d;
-UINTX a[N];
-UINTX r1[N];
-UINTX r2[N];
-
-UINTX bc[2];
-
-uint64_t NOINLINE test1()
-{
-   barrett_constant(bc, d);
-
-   uint64_t t = __rdtsc();
-
-   for (size_t i = 0; i < N; i++)
-      barrett_umod(r1[i], a[i], bc, d);
-
-   return __rdtsc() - t;
-}
-
-uint64_t NOINLINE test2()
-{
-   uint64_t t = __rdtsc();
-
-   for (size_t i = 0; i < N; i++)
-      CppCore::umod(r2[i], a[i], d);
-
-   return __rdtsc() - t;
-}
-
-int main()
-{
-   prng.fill(a);
-   prng.fill(d);
-
-   for (size_t i = 0; i < N; i++)
-      ((uint8_t*)&a[i])[sizeof(UINTX) - 1] |= 0x80;
-
-   uint64_t t2 = test2();
-   uint64_t t1 = test1();
-
-   std::cout << "BARRETT: " << t1 << std::endl;
-   std::cout << "KNUTH:   " << t2 << std::endl;
-
-   for (size_t i = 0; i < N; i++)
-   {
-      if (r1[i] != r2[i])
-      {
-         std::cout << "FAIL" << std::endl;
-         //std::cout << std::hex << r2.toHexString() << std::endl;
-         return 1;
-      }
-   }
-
-   return 0;
-}*/
-
-UINTX a1[N];
-UINTX a2[N];
-UINTX b[N];
-UINTX m[N];
-UINTX r1[N];
-UINTX r2[N];
-//UINTX at;
-
 int main()
 {
-   /*uint64_t ta = 245373463472;
-   uint64_t tb = 3252764873;
-   uint64_t tm = 32534684;
-   uint64_t tr;
-   CppCore::upowmod_kary(ta, tb, tm, tr);
-   std::cout << tr << std::endl;
-
-   CppCore::upowmod(ta, tb, tm, tr);
-   std::cout << tr << std::endl;*/
-
-
-   /*CppCore::clear(a1);
-   CppCore::clear(b);
-   CppCore::clear(m);
-   for (size_t i = 0; i < N; i++)
-   {
-      *(uint32_t*)&a1[i] = (uint32_t)prng.next();
-      *(uint32_t*)&b[i] = (uint32_t)prng.next();
-      *(uint32_t*)&m[i] = (uint32_t)prng.next();
-   }*/
-
-   prng.fill(a1);
-   prng.fill(b);
-   prng.fill(m);
-
-   CppCore::clone(a2, a1);
-
-   uint64_t t = __rdtsc();
-   for (size_t i = 0; i < N; i++)
-   {
-      /*if (i % 1024 == 0) {
-         std::cout << i << std::endl;
-      }*/
-      //CppCore::upowmod_single(a1[i], b[i], m[i], r1[i]);
-      CppCore::upowmod<UINTX, 4U>(a2[i], b[i], m[i], r2[i]);
-      /*if (!CppCore::equal(r1[i], r2[i])) {
-         std::cout << a1[i] << "^" << b[i] << " % " << m[i] << " = " << r1[i] << std::endl;
-         std::cout << a2[i] << "^" << b[i] << " % " << m[i] << " = " << r2[i] << std::endl;
-         return 1;
-      }*/
-   }
-   t = __rdtsc() - t;
-   std::cout << t << std::endl;
-
-   UINTX r = 0;
-   for (size_t i = 0; i < N; i++)
-   {
-      r += r2[i];
-   }
-   std::cout << r << std::endl;
-
    return 0;
 }
-
-
-
-
-//std::cout << std::setfill('0') << std::setw(5) << std::setprecision(10);
-
-/*double t = 0.5;
-double z;
-for (size_t i = 0; i < Primes::NUMODDPRIMES; i++)
-{
-   z = (1.0 - 1.0 / (double)Primes::ODDPRIMES[i]);
-   t *= z;
-   std::cout << std::setfill('0') << std::setw(5) << i << "  " << Primes::ODDPRIMES[i] << "  " << std::fixed << std::setprecision(10) << (1.0 - t) << std::endl;
-}*/
\ No newline at end of file

From 08dc865804873e0471abce0fd85055ebd5c3a5e6 Mon Sep 17 00:00:00 2001
From: Clint Banzhaf <clint@banzhaf.pro>
Date: Sun, 11 Jan 2026 15:20:57 +0100
Subject: [PATCH 8/8] cleanup

---
 include/CppCore.Test/Math/Util.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/CppCore.Test/Math/Util.h b/include/CppCore.Test/Math/Util.h
index 112e4362..5374c681 100644
--- a/include/CppCore.Test/Math/Util.h
+++ b/include/CppCore.Test/Math/Util.h
@@ -1573,7 +1573,6 @@ namespace CppCore { namespace Test { namespace VS { namespace Math {
       TEST_METHOD(UPOWMOD64)        { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod64()); }
       TEST_METHOD(UPOWMOD128)       { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod<2>()); }
       TEST_METHOD(UPOWMOD256)       { Assert::AreEqual(true, CppCore::Test::Math::Util::upowmod<4>()); }
-
       TEST_METHOD(UPOW32)           { Assert::AreEqual(true, CppCore::Test::Math::Util::upow32()); }
       TEST_METHOD(UPOW64)           { Assert::AreEqual(true, CppCore::Test::Math::Util::upow64()); }
       TEST_METHOD(UDIVMOD32)        { Assert::AreEqual(true, CppCore::Test::Math::Util::udivmod32()); }