From 9fabf0c57d3b3b4740fc62e05634b534e8ef8d9d Mon Sep 17 00:00:00 2001 From: tompng Date: Sun, 17 Aug 2025 15:07:48 +0900 Subject: [PATCH 1/4] Implement faster multiplication using Number Theoretic Transform Performs ntt with three primes (29<<27|1, 26<<27|1, 24<<27|1) --- bigdecimal.gemspec | 1 + ext/bigdecimal/bigdecimal.c | 39 ++++++ ext/bigdecimal/ntt.h | 191 +++++++++++++++++++++++++++ test/bigdecimal/test_vp_operation.rb | 13 ++ 4 files changed, 244 insertions(+) create mode 100644 ext/bigdecimal/ntt.h diff --git a/bigdecimal.gemspec b/bigdecimal.gemspec index b6ef8fd9..2c1550cd 100644 --- a/bigdecimal.gemspec +++ b/bigdecimal.gemspec @@ -46,6 +46,7 @@ Gem::Specification.new do |s| ext/bigdecimal/feature.h ext/bigdecimal/missing.c ext/bigdecimal/missing.h + ext/bigdecimal/ntt.h ext/bigdecimal/missing/dtoa.c ext/bigdecimal/static_assert.h ] diff --git a/ext/bigdecimal/bigdecimal.c b/ext/bigdecimal/bigdecimal.c index 6f4249a4..21dde775 100644 --- a/ext/bigdecimal/bigdecimal.c +++ b/ext/bigdecimal/bigdecimal.c @@ -33,6 +33,12 @@ #define BIGDECIMAL_VERSION "4.0.1" +#if SIZEOF_DECDIG == 4 +#define USE_NTT_MULTIPLICATION 1 +#include "ntt.h" +#define NTT_MULTIPLICATION_THRESHOLD 100 +#endif + #define SIGNED_VALUE_MAX INTPTR_MAX #define SIGNED_VALUE_MIN INTPTR_MIN #define MUL_OVERFLOW_SIGNED_VALUE_P(a, b) MUL_OVERFLOW_SIGNED_INTEGER_P(a, b, SIGNED_VALUE_MIN, SIGNED_VALUE_MAX) @@ -3259,6 +3265,25 @@ BigDecimal_vpmult(VALUE self, VALUE v) { RB_GC_GUARD(b.bigdecimal); return c.bigdecimal; } + +#if SIZEOF_DECDIG == 4 +VALUE +BigDecimal_nttmult(VALUE self, VALUE v) { + BDVALUE a,b,c; + a = GetBDValueMust(self); + b = GetBDValueMust(v); + c = NewZeroWrap(1, VPMULT_RESULT_PREC(a.real, b.real) * BASE_FIG); + ntt_multiply(a.real->Prec, b.real->Prec, a.real->frac, b.real->frac, c.real->frac); + VpSetSign(c.real, a.real->sign * b.real->sign); + c.real->exponent = a.real->exponent + b.real->exponent; + c.real->Prec = a.real->Prec + b.real->Prec; + VpNmlz(c.real); + RB_GC_GUARD(a.bigdecimal); + RB_GC_GUARD(b.bigdecimal); + return c.bigdecimal; +} +#endif + #endif /* BIGDECIMAL_USE_VP_TEST_METHODS */ /* Document-class: BigDecimal @@ -3630,6 +3655,9 @@ Init_bigdecimal(void) #ifdef BIGDECIMAL_USE_VP_TEST_METHODS rb_define_method(rb_cBigDecimal, "vpdivd", BigDecimal_vpdivd, 2); rb_define_method(rb_cBigDecimal, "vpmult", BigDecimal_vpmult, 1); +#ifdef USE_NTT_MULTIPLICATION + rb_define_method(rb_cBigDecimal, "nttmult", BigDecimal_nttmult, 1); +#endif #endif /* BIGDECIMAL_USE_VP_TEST_METHODS */ #define ROUNDING_MODE(i, name, value) \ @@ -4912,6 +4940,15 @@ VpMult(Real *c, Real *a, Real *b) c->exponent = a->exponent; /* set exponent */ VpSetSign(c, VpGetSign(a) * VpGetSign(b)); /* set sign */ if (!AddExponent(c, b->exponent)) return 0; + +#ifdef USE_NTT_MULTIPLICATION + if (b->Prec >= NTT_MULTIPLICATION_THRESHOLD) { + ntt_multiply((uint32_t)a->Prec, (uint32_t)b->Prec, a->frac, b->frac, c->frac); + c->Prec = a->Prec + b->Prec; + goto Cleanup; + } +#endif + carry = 0; nc = ind_c = MxIndAB; memset(c->frac, 0, (nc + 1) * sizeof(DECDIG)); /* Initialize c */ @@ -4958,6 +4995,8 @@ VpMult(Real *c, Real *a, Real *b) } } } + +Cleanup: VpNmlz(c); Exit: diff --git a/ext/bigdecimal/ntt.h b/ext/bigdecimal/ntt.h new file mode 100644 index 00000000..941f23f7 --- /dev/null +++ b/ext/bigdecimal/ntt.h @@ -0,0 +1,191 @@ +// NTT (Number Theoretic Transform) implementation for BigDecimal multiplication + +#define NTT_PRIMITIVE_ROOT 17 +#define NTT_PRIME_BASE1 24 +#define NTT_PRIME_BASE2 26 +#define NTT_PRIME_BASE3 29 +#define NTT_PRIME_SHIFT 27 +#define NTT_PRIME1 (((uint32_t)NTT_PRIME_BASE1 << NTT_PRIME_SHIFT) | 1) +#define NTT_PRIME2 (((uint32_t)NTT_PRIME_BASE2 << NTT_PRIME_SHIFT) | 1) +#define NTT_PRIME3 (((uint32_t)NTT_PRIME_BASE3 << NTT_PRIME_SHIFT) | 1) +#define MAX_NTT32_BITS 27 +#define NTT_DECDIG_BASE 1000000000 + +// Calculates base**ex % mod +static uint32_t +mod_pow(uint32_t base, uint32_t ex, uint32_t mod) { + uint32_t res = 1; + uint32_t bit = 1; + while (true) { + if (ex & bit) { + ex ^= bit; + res = ((uint64_t)res * base) % mod; + } + if (!ex) break; + base = ((uint64_t)base * base) % mod; + bit <<= 1; + } + return res; +} + +// Recursively performs butterfly operations of NTT +static void +ntt_recursive(int size_bits, uint32_t *input, uint32_t *output, uint32_t *tmp, int depth, uint32_t r, uint32_t prime) { + if (depth > 0) { + ntt_recursive(size_bits, input, tmp, output, depth - 1, ((uint64_t)r * r) % prime, prime); + } else { + tmp = input; + } + uint32_t size_half = (uint32_t)1 << (size_bits - 1); + uint32_t stride = (uint32_t)1 << (size_bits - depth - 1); + uint32_t n = size_half / stride; + uint32_t rn = 1, rm = prime - 1; + uint32_t idx = 0; + for (uint32_t i = 0; i < n; i++) { + uint32_t j = i * 2 * stride; + for (uint32_t k = 0; k < stride; k++, j++, idx++) { + uint32_t a = tmp[j], b = tmp[j + stride]; + output[idx] = (a + (uint64_t)rn * b) % prime; + output[idx + size_half] = (a + (uint64_t)rm * b) % prime; + } + rn = ((uint64_t)rn * r) % prime; + rm = ((uint64_t)rm * r) % prime; + } +} + +/* Perform NTT on input array. + * base, shift: Represent the prime number as (base << shift | 1) + * r_base: Primitive root of unity modulo prime + * size_bits: log2 of the size of the input array. Should be less or equal to shift + * input: input array of size (1 << size_bits) + */ +static void +ntt(int size_bits, uint32_t *input, uint32_t *output, uint32_t *tmp, int r_base, int base, int shift, int dir) { + uint32_t size = (uint32_t)1 << size_bits; + uint32_t prime = ((uint32_t)base << shift) | 1; + + // rmax**(1 << shift) % prime == 1 + // r**size % prime == 1 + uint32_t rmax = mod_pow(r_base, base, prime); + uint32_t r = mod_pow(rmax, (uint32_t)1 << (shift - size_bits), prime); + + if (dir < 0) r = mod_pow(r, prime - 2, prime); + ntt_recursive(size_bits, input, output, tmp, size_bits - 1, r, prime); + if (dir < 0) { + uint32_t n_inv = mod_pow((uint32_t)size, prime - 2, prime); + for (uint32_t i = 0; i < size; i++) { + output[i] = ((uint64_t)output[i] * n_inv) % prime; + } + } +} + +/* Calculate c that satisfies: c % PRIME1 == mod1 && c % PRIME2 == mod2 && c % PRIME3 == mod3 + * c = (mod1 * 35002755423056150739595925972 + mod2 * 14584479687667766215746868453 + mod3 * 37919651490985126265126719818) % (PRIME1 * PRIME2 * PRIME3) + * Assume c <= 999999999**2*(1<<27) + */ +static inline void +mod_restore_prime_24_26_29_shift_27(uint32_t mod1, uint32_t mod2, uint32_t mod3, uint32_t *digits) { + // Use mixed radix notation to eliminate modulo by PRIME1 * PRIME2 * PRIME3 + // [DIG0, DIG1, DIG2] = DIG0 + DIG1 * PRIME1 + DIG2 * PRIME1 * PRIME2 + // DIG0: 0...PRIME1, DIG1: 0...PRIME2, DIG2: 0...PRIME3 + // 35002755423056150739595925972 = [1, 3489660916, 3113851359] + // 14584479687667766215746868453 = [0, 13, 1297437912] + // 37919651490985126265126719818 = [0, 0, 3373338954] + uint64_t c0 = mod1; + uint64_t c1 = (uint64_t)mod2 * 13 + (uint64_t)mod1 * 3489660916; + uint64_t c2 = (uint64_t)mod3 * 3373338954 % NTT_PRIME3 + (uint64_t)mod2 * 1297437912 % NTT_PRIME3 + (uint64_t)mod1 * 3113851359 % NTT_PRIME3; + c2 += c1 / NTT_PRIME2; + c1 %= NTT_PRIME2; + c2 %= NTT_PRIME3; + // Base conversion. c fits in 3 digits. + c1 += c2 % NTT_DECDIG_BASE * NTT_PRIME2; + c0 += c1 % NTT_DECDIG_BASE * NTT_PRIME1; + c1 /= NTT_DECDIG_BASE; + digits[0] = c0 % NTT_DECDIG_BASE; + c0 /= NTT_DECDIG_BASE; + c1 += c2 / NTT_DECDIG_BASE % NTT_DECDIG_BASE * NTT_PRIME2; + c0 += c1 % NTT_DECDIG_BASE * NTT_PRIME1; + c1 /= NTT_DECDIG_BASE; + digits[1] = c0 % NTT_DECDIG_BASE; + digits[2] = (uint32_t)(c0 / NTT_DECDIG_BASE + c1 % NTT_DECDIG_BASE * NTT_PRIME1); +} + +/* + * NTT multiplication + * Uses three NTTs with mod (24 << 27 | 1), (26 << 27 | 1), and (29 << 27 | 1) + */ +static void +ntt_multiply(size_t a_size, size_t b_size, uint32_t *a, uint32_t *b, uint32_t *c) { + if (a_size < b_size) { + ntt_multiply(b_size, a_size, b, a, c); + return; + } + + int b_bits = 0; + while (((uint32_t)1 << b_bits) < (uint32_t)b_size) b_bits++; + int ntt_size_bits = b_bits + 1; + if (ntt_size_bits > MAX_NTT32_BITS) { + rb_raise(rb_eArgError, "Multiply size too large"); + } + + // To calculate large_a * small_b faster, split into several batches. + uint32_t ntt_size = (uint32_t)1 << ntt_size_bits; + uint32_t batch_size = ntt_size - (uint32_t)b_size; + uint32_t batch_count = (uint32_t)((a_size + batch_size - 1) / batch_size); + + uint32_t *mem = ruby_xcalloc(sizeof(uint32_t), ntt_size * 9); + uint32_t *ntt1 = mem; + uint32_t *ntt2 = mem + ntt_size; + uint32_t *ntt3 = mem + ntt_size * 2; + uint32_t *tmp1 = mem + ntt_size * 3; + uint32_t *tmp2 = mem + ntt_size * 4; + uint32_t *tmp3 = mem + ntt_size * 5; + uint32_t *conv1 = mem + ntt_size * 6; + uint32_t *conv2 = mem + ntt_size * 7; + uint32_t *conv3 = mem + ntt_size * 8; + + // Calculate NTT for b in three primes. Result is reused for each batch of a. + memcpy(tmp1, b, b_size * sizeof(uint32_t)); + memset(tmp1 + b_size, 0, (ntt_size - b_size) * sizeof(uint32_t)); + ntt(ntt_size_bits, tmp1, ntt1, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, +1); + ntt(ntt_size_bits, tmp1, ntt2, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, +1); + ntt(ntt_size_bits, tmp1, ntt3, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, +1); + + memset(c, 0, (a_size + b_size) * sizeof(uint32_t)); + for (uint32_t idx = 0; idx < batch_count; idx++) { + uint32_t len = idx == batch_count - 1 ? (uint32_t)a_size - idx * batch_size : batch_size; + memcpy(tmp1, a + idx * batch_size, len * sizeof(uint32_t)); + memset(tmp1 + len, 0, (ntt_size - len) * sizeof(uint32_t)); + // Calculate convolution for this batch in three primes + ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, +1); + for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt1[i]) % NTT_PRIME1; + ntt(ntt_size_bits, tmp2, conv1, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, -1); + ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, +1); + for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt2[i]) % NTT_PRIME2; + ntt(ntt_size_bits, tmp2, conv2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, -1); + ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, +1); + for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt3[i]) % NTT_PRIME3; + ntt(ntt_size_bits, tmp2, conv3, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, -1); + + // Restore the original convolution value from three convolutions calculated in three primes. + // Each convolution value is maximum 999999999**2*(1<<27)/2 + for (uint32_t i = 0; i < ntt_size; i++) { + uint32_t dig[3]; + mod_restore_prime_24_26_29_shift_27(conv1[i], conv2[i], conv3[i], dig); + // Maximum values of dig[0], dig[1], and dig[2] are 999999999, 999999999 and 67108863 respectively + // Maximum overlapped sum (considering overlaps between 2 batches) is less than 4134217722 + // so this sum doesn't overflow uint32_t. + for (int j = 0; j < 3; j++) { + // Index check: if dig[j] is non-zero, assign index is within valid range. + if (dig[j]) c[idx * batch_size + i + 1 - j] += dig[j]; + } + } + } + uint32_t carry = 0; + for (int32_t i = (uint32_t)(a_size + b_size - 1); i >= 0; i--) { + uint32_t v = c[i] + carry; + c[i] = v % NTT_DECDIG_BASE; + carry = v / NTT_DECDIG_BASE; + } + ruby_xfree(mem); +} diff --git a/test/bigdecimal/test_vp_operation.rb b/test/bigdecimal/test_vp_operation.rb index b2f1d75a..5cce40ba 100644 --- a/test/bigdecimal/test_vp_operation.rb +++ b/test/bigdecimal/test_vp_operation.rb @@ -13,6 +13,10 @@ def setup end end + def ntt_mult_available? + BASE_FIG == 9 + end + def test_vpmult assert_equal(BigDecimal('121932631112635269'), BigDecimal('123456789').vpmult(BigDecimal('987654321'))) assert_equal(BigDecimal('12193263.1112635269'), BigDecimal('123.456789').vpmult(BigDecimal('98765.4321'))) @@ -21,6 +25,15 @@ def test_vpmult assert_equal(BigDecimal("#{x * y}e-300"), BigDecimal("#{x}e-100").vpmult(BigDecimal("#{y}e-200"))) end + def test_nttmult + omit 'NTT multiplication is only available for 32-bit DECDIG' unless ntt_mult_available? + [*1..32].repeated_permutation(2) do |a, b| + x = BigDecimal(10 ** (BASE_FIG * a) / 7) + y = BigDecimal(10 ** (BASE_FIG * b) / 13) + assert_equal(x.to_i * y.to_i, x.nttmult(y)) + end + end + def test_vpdivd # a[0] > b[0] # XXXX_YYYY_ZZZZ / 1111 #=> 000X_000Y_000Z From 1025a0e6ab41d7fd3595ceb567132346bdd04926 Mon Sep 17 00:00:00 2001 From: tompng Date: Tue, 19 Aug 2025 01:19:57 +0900 Subject: [PATCH 2/4] Implement Newton-Raphson division Improve performance of huge divisions --- bigdecimal.gemspec | 1 + ext/bigdecimal/bigdecimal.c | 47 +++++-- ext/bigdecimal/bigdecimal.h | 26 ++++ ext/bigdecimal/div.h | 192 +++++++++++++++++++++++++++ test/bigdecimal/test_vp_operation.rb | 107 +++++++++++---- 5 files changed, 334 insertions(+), 39 deletions(-) create mode 100644 ext/bigdecimal/div.h diff --git a/bigdecimal.gemspec b/bigdecimal.gemspec index 2c1550cd..6b20ac08 100644 --- a/bigdecimal.gemspec +++ b/bigdecimal.gemspec @@ -43,6 +43,7 @@ Gem::Specification.new do |s| ext/bigdecimal/bigdecimal.c ext/bigdecimal/bigdecimal.h ext/bigdecimal/bits.h + ext/bigdecimal/div.h ext/bigdecimal/feature.h ext/bigdecimal/missing.c ext/bigdecimal/missing.h diff --git a/ext/bigdecimal/bigdecimal.c b/ext/bigdecimal/bigdecimal.c index 21dde775..baf34fdf 100644 --- a/ext/bigdecimal/bigdecimal.c +++ b/ext/bigdecimal/bigdecimal.c @@ -29,6 +29,7 @@ #endif #include "bits.h" +#include "div.h" #include "static_assert.h" #define BIGDECIMAL_VERSION "4.0.1" @@ -37,6 +38,7 @@ #define USE_NTT_MULTIPLICATION 1 #include "ntt.h" #define NTT_MULTIPLICATION_THRESHOLD 100 +#define NEWTON_RAPHSON_DIVISION_THRESHOLD 200 #endif #define SIGNED_VALUE_MAX INTPTR_MAX @@ -79,11 +81,6 @@ static struct { uint8_t mode; } rbd_rounding_modes[RBD_NUM_ROUNDING_MODES]; -typedef struct { - VALUE bigdecimal; - Real *real; -} BDVALUE; - typedef struct { VALUE bigdecimal_or_nil; Real *real_or_null; @@ -211,7 +208,6 @@ rbd_allocate_struct_zero(int sign, size_t const digits) static unsigned short VpGetException(void); static void VpSetException(unsigned short f); static void VpCheckException(Real *p, bool always); -static int AddExponent(Real *a, SIGNED_VALUE n); static VALUE CheckGetValue(BDVALUE v); static void VpInternalRound(Real *c, size_t ixDigit, DECDIG vPrev, DECDIG v); static int VpLimitRound(Real *c, size_t ixDigit); @@ -1077,9 +1073,6 @@ BigDecimal_check_num(Real *p) VpCheckException(p, true); } -static VALUE BigDecimal_fix(VALUE self); -static VALUE BigDecimal_split(VALUE self); - /* Returns the value as an Integer. * * If the BigDecimal is infinity or NaN, raises FloatDomainError. @@ -3241,19 +3234,39 @@ BigDecimal_literal(const char *str) #ifdef BIGDECIMAL_USE_VP_TEST_METHODS VALUE -BigDecimal_vpdivd(VALUE self, VALUE r, VALUE cprec) { - BDVALUE a,b,c,d; +BigDecimal_vpdivd_generic(VALUE self, VALUE r, VALUE cprec, void (*vpdivd_func)(Real*, Real*, Real*, Real*)) { + BDVALUE a, b, c, d; size_t cn = NUM2INT(cprec); a = GetBDValueMust(self); b = GetBDValueMust(r); c = NewZeroWrap(1, cn * BASE_FIG); d = NewZeroWrap(1, VPDIVD_REM_PREC(a.real, b.real, c.real) * BASE_FIG); - VpDivd(c.real, d.real, a.real, b.real); + vpdivd_func(c.real, d.real, a.real, b.real); RB_GC_GUARD(a.bigdecimal); RB_GC_GUARD(b.bigdecimal); return rb_assoc_new(c.bigdecimal, d.bigdecimal); } +void +VpDivdNormal(Real *c, Real *r, Real *a, Real *b) { + VpDivd(c, r, a, b); +} + +VALUE +BigDecimal_vpdivd(VALUE self, VALUE r, VALUE cprec) { + return BigDecimal_vpdivd_generic(self, r, cprec, VpDivdNormal); +} + +VALUE +BigDecimal_vpdivd_newton(VALUE self, VALUE r, VALUE cprec) { + return BigDecimal_vpdivd_generic(self, r, cprec, VpDivdNewton); +} + +VALUE +BigDecimal_newton_raphson_inverse(VALUE self, VALUE prec) { + return newton_raphson_inverse(self, NUM2SIZET(prec)); +} + VALUE BigDecimal_vpmult(VALUE self, VALUE v) { BDVALUE a,b,c; @@ -3654,6 +3667,8 @@ Init_bigdecimal(void) #ifdef BIGDECIMAL_USE_VP_TEST_METHODS rb_define_method(rb_cBigDecimal, "vpdivd", BigDecimal_vpdivd, 2); + rb_define_method(rb_cBigDecimal, "vpdivd_newton", BigDecimal_vpdivd_newton, 2); + rb_define_method(rb_cBigDecimal, "newton_raphson_inverse", BigDecimal_newton_raphson_inverse, 1); rb_define_method(rb_cBigDecimal, "vpmult", BigDecimal_vpmult, 1); #ifdef USE_NTT_MULTIPLICATION rb_define_method(rb_cBigDecimal, "nttmult", BigDecimal_nttmult, 1); @@ -5044,6 +5059,14 @@ VpDivd(Real *c, Real *r, Real *a, Real *b) if (word_a > word_r || word_b + word_c - 2 >= word_r) goto space_error; +#ifdef USE_NTT_MULTIPLICATION + // Newton-Raphson division requires multiplication to be faster than O(n^2) + if (word_c >= NEWTON_RAPHSON_DIVISION_THRESHOLD && word_b >= NEWTON_RAPHSON_DIVISION_THRESHOLD) { + VpDivdNewton(c, r, a, b); + goto Exit; + } +#endif + for (i = 0; i < word_a; ++i) r->frac[i] = a->frac[i]; for (i = word_a; i < word_r; ++i) r->frac[i] = 0; for (i = 0; i < word_c; ++i) c->frac[i] = 0; diff --git a/ext/bigdecimal/bigdecimal.h b/ext/bigdecimal/bigdecimal.h index 82c88a2a..71ddb21f 100644 --- a/ext/bigdecimal/bigdecimal.h +++ b/ext/bigdecimal/bigdecimal.h @@ -188,6 +188,11 @@ typedef struct { DECDIG frac[FLEXIBLE_ARRAY_SIZE]; /* Array of fraction part. */ } Real; +typedef struct { + VALUE bigdecimal; + Real *real; +} BDVALUE; + /* * ------------------ * EXPORTables. @@ -232,10 +237,31 @@ VP_EXPORT int VpActiveRound(Real *y, Real *x, unsigned short f, ssize_t il); VP_EXPORT int VpMidRound(Real *y, unsigned short f, ssize_t nf); VP_EXPORT int VpLeftRound(Real *y, unsigned short f, ssize_t nf); VP_EXPORT void VpFrac(Real *y, Real *x); +VP_EXPORT int AddExponent(Real *a, SIGNED_VALUE n); /* VP constants */ VP_EXPORT Real *VpOne(void); +/* + * **** BigDecimal part **** + */ +VP_EXPORT VALUE BigDecimal_lt(VALUE self, VALUE r); +VP_EXPORT VALUE BigDecimal_ge(VALUE self, VALUE r); +VP_EXPORT VALUE BigDecimal_exponent(VALUE self); +VP_EXPORT VALUE BigDecimal_fix(VALUE self); +VP_EXPORT VALUE BigDecimal_frac(VALUE self); +VP_EXPORT VALUE BigDecimal_add(VALUE self, VALUE b); +VP_EXPORT VALUE BigDecimal_sub(VALUE self, VALUE b); +VP_EXPORT VALUE BigDecimal_mult(VALUE self, VALUE b); +VP_EXPORT VALUE BigDecimal_add2(VALUE self, VALUE b, VALUE n); +VP_EXPORT VALUE BigDecimal_sub2(VALUE self, VALUE b, VALUE n); +VP_EXPORT VALUE BigDecimal_mult2(VALUE self, VALUE b, VALUE n); +VP_EXPORT VALUE BigDecimal_split(VALUE self); +VP_EXPORT VALUE BigDecimal_decimal_shift(VALUE self, VALUE v); +VP_EXPORT inline BDVALUE GetBDValueMust(VALUE v); +VP_EXPORT inline BDVALUE rbd_allocate_struct_zero_wrap(int sign, size_t const digits); +#define NewZeroWrap rbd_allocate_struct_zero_wrap + /* * ------------------ * MACRO definitions. diff --git a/ext/bigdecimal/div.h b/ext/bigdecimal/div.h new file mode 100644 index 00000000..e6dd89c9 --- /dev/null +++ b/ext/bigdecimal/div.h @@ -0,0 +1,192 @@ +// Calculate the inverse of x using the Newton-Raphson method. +static VALUE +newton_raphson_inverse(VALUE x, size_t prec) { + BDVALUE bdone = NewZeroWrap(1, 1); + VpSetOne(bdone.real); + VALUE one = bdone.bigdecimal; + + // Initial approximation in 2 digits + BDVALUE bdx = GetBDValueMust(x); + BDVALUE inv0 = NewZeroWrap(1, 2 * BIGDECIMAL_COMPONENT_FIGURES); + VpSetOne(inv0.real); + DECDIG_DBL numerator = (DECDIG_DBL)BIGDECIMAL_BASE * 100; + DECDIG_DBL denominator = (DECDIG_DBL)bdx.real->frac[0] * 100 + (DECDIG_DBL)(bdx.real->Prec >= 2 ? bdx.real->frac[1] : 0) * 100 / BIGDECIMAL_BASE; + inv0.real->frac[0] = (DECDIG)(numerator / denominator); + inv0.real->frac[1] = (DECDIG)((numerator % denominator) * (BIGDECIMAL_BASE / 100) / denominator * 100); + inv0.real->Prec = 2; + inv0.real->exponent = 1 - bdx.real->exponent; + VpNmlz(inv0.real); + RB_GC_GUARD(bdx.bigdecimal); + VALUE inv = inv0.bigdecimal; + + int bl = 1; + while (((size_t)1 << bl) < prec) bl++; + + for (int i = bl; i >= 0; i--) { + size_t n = (prec >> i) + 2; + if (n > prec) n = prec; + // Newton-Raphson iteration: inv_next = inv + inv * (1 - x * inv) + VALUE one_minus_x_inv = BigDecimal_sub2( + one, + BigDecimal_mult(BigDecimal_mult2(x, one, SIZET2NUM(n + 1)), inv), + SIZET2NUM(SIZET2NUM(n / 2)) + ); + inv = BigDecimal_add2( + inv, + BigDecimal_mult(inv, one_minus_x_inv), + SIZET2NUM(n) + ); + } + return inv; +} + +// Calculates divmod by multiplying approximate reciprocal of y +static void +divmod_by_inv_mul(VALUE x, VALUE y, VALUE inv, VALUE *res_div, VALUE *res_mod) { + VALUE div = BigDecimal_fix(BigDecimal_mult(x, inv)); + VALUE mod = BigDecimal_sub(x, BigDecimal_mult(div, y)); + while (RTEST(BigDecimal_lt(mod, INT2FIX(0)))) { + mod = BigDecimal_add(mod, y); + div = BigDecimal_sub(div, INT2FIX(1)); + } + while (RTEST(BigDecimal_ge(mod, y))) { + mod = BigDecimal_sub(mod, y); + div = BigDecimal_add(div, INT2FIX(1)); + } + *res_div = div; + *res_mod = mod; +} + +static void +slice_copy(DECDIG *dest, Real *src, size_t rshift, size_t length) { + ssize_t start = src->exponent - rshift - length; + if (start >= (ssize_t)src->Prec) return; + if (start < 0) { + dest -= start; + length += start; + start = 0; + } + size_t max_length = src->Prec - start; + memcpy(dest, src->frac + start, Min(length, max_length) * sizeof(DECDIG)); +} + +/* Calculates divmod using Newton-Raphson method. + * x and y must be a BigDecimal representing an integer value. + * + * To calculate with low cost, we need to split x into blocks and perform divmod for each block. + * x_digits = remaining_digits(<= y_digits) + block_digits * num_blocks + * + * Example: + * xxx_xxxxx_xxxxx_xxxxx(18 digits) / yyyyy(5 digits) + * remaining_digits = 3, block_digits = 5, num_blocks = 3 + * repeating xxxxx_xxxxxx.divmod(yyyyy) calculation 3 times. + * + * In each divmod step, dividend is at most (y_digits + block_digits) digits and divisor is y_digits digits. + * Reciprocal of y needs block_digits + 1 precision. + */ +static void +divmod_newton(VALUE x, VALUE y, VALUE *div_out, VALUE *mod_out) { + size_t x_digits = NUM2SIZET(BigDecimal_exponent(x)); + size_t y_digits = NUM2SIZET(BigDecimal_exponent(y)); + if (x_digits <= y_digits) x_digits = y_digits + 1; + + size_t n = x_digits / y_digits; + size_t block_figs = (x_digits - y_digits) / n / BIGDECIMAL_COMPONENT_FIGURES + 1; + size_t block_digits = block_figs * BIGDECIMAL_COMPONENT_FIGURES; + size_t num_blocks = (x_digits - y_digits + block_digits - 1) / block_digits; + size_t y_figs = (y_digits - 1) / BIGDECIMAL_COMPONENT_FIGURES + 1; + VALUE yinv = newton_raphson_inverse(y, block_digits + 1); + + BDVALUE divident = NewZeroWrap(1, BIGDECIMAL_COMPONENT_FIGURES * (y_figs + block_figs)); + BDVALUE div_result = NewZeroWrap(1, BIGDECIMAL_COMPONENT_FIGURES * (num_blocks * block_figs + 1)); + BDVALUE bdx = GetBDValueMust(x); + + VALUE mod = BigDecimal_fix(BigDecimal_decimal_shift(x, SSIZET2NUM(-num_blocks * block_digits))); + for (ssize_t i = num_blocks - 1; i >= 0; i--) { + memset(divident.real->frac, 0, (y_figs + block_figs) * sizeof(DECDIG)); + + BDVALUE bdmod = GetBDValueMust(mod); + slice_copy(divident.real->frac, bdmod.real, 0, y_figs); + slice_copy(divident.real->frac + y_figs, bdx.real, i * block_figs, block_figs); + RB_GC_GUARD(bdmod.bigdecimal); + + VpSetSign(divident.real, 1); + divident.real->exponent = y_figs + block_figs; + divident.real->Prec = y_figs + block_figs; + VpNmlz(divident.real); + + VALUE div; + divmod_by_inv_mul(divident.bigdecimal, y, yinv, &div, &mod); + BDVALUE bddiv = GetBDValueMust(div); + slice_copy(div_result.real->frac + (num_blocks - i - 1) * block_figs, bddiv.real, 0, block_figs + 1); + RB_GC_GUARD(bddiv.bigdecimal); + } + VpSetSign(div_result.real, 1); + div_result.real->exponent = num_blocks * block_figs + 1; + div_result.real->Prec = num_blocks * block_figs + 1; + VpNmlz(div_result.real); + RB_GC_GUARD(bdx.bigdecimal); + RB_GC_GUARD(divident.bigdecimal); + RB_GC_GUARD(div_result.bigdecimal); + *div_out = div_result.bigdecimal; + *mod_out = mod; +} + +static VALUE +VpDivdNewtonInner(VALUE args_ptr) +{ + Real **args = (Real**)args_ptr; + Real *c = args[0], *r = args[1], *a = args[2], *b = args[3]; + BDVALUE a2, b2, c2, r2; + VALUE div, mod, a2_frac = Qnil; + size_t div_prec = c->MaxPrec - 1; + size_t base_prec = b->Prec; + + a2 = NewZeroWrap(1, a->Prec * BIGDECIMAL_COMPONENT_FIGURES); + b2 = NewZeroWrap(1, b->Prec * BIGDECIMAL_COMPONENT_FIGURES); + VpAsgn(a2.real, a, 1); + VpAsgn(b2.real, b, 1); + VpSetSign(a2.real, 1); + VpSetSign(b2.real, 1); + a2.real->exponent = base_prec + div_prec; + b2.real->exponent = base_prec; + + if ((ssize_t)a2.real->Prec > a2.real->exponent) { + a2_frac = BigDecimal_frac(a2.bigdecimal); + VpMidRound(a2.real, VP_ROUND_DOWN, 0); + } + divmod_newton(a2.bigdecimal, b2.bigdecimal, &div, &mod); + if (a2_frac != Qnil) mod = BigDecimal_add(mod, a2_frac); + + c2 = GetBDValueMust(div); + r2 = GetBDValueMust(mod); + VpAsgn(c, c2.real, VpGetSign(a) * VpGetSign(b)); + VpAsgn(r, r2.real, VpGetSign(a)); + AddExponent(c, a->exponent); + AddExponent(c, -b->exponent); + AddExponent(c, -div_prec); + AddExponent(r, a->exponent); + AddExponent(r, -base_prec - div_prec); + RB_GC_GUARD(a2.bigdecimal); + RB_GC_GUARD(a2.bigdecimal); + RB_GC_GUARD(c2.bigdecimal); + RB_GC_GUARD(r2.bigdecimal); + return Qnil; +} + +static VALUE +ensure_restore_prec_limit(VALUE limit) +{ + VpSetPrecLimit(NUM2SIZET(limit)); + return Qnil; +} + +static void +VpDivdNewton(Real *c, Real *r, Real *a, Real *b) +{ + Real *args[4] = {c, r, a, b}; + size_t pl = VpGetPrecLimit(); + VpSetPrecLimit(0); + // Ensure restoring prec limit because some methods used in VpDivdNewtonInner may raise an exception + rb_ensure(VpDivdNewtonInner, (VALUE)args, ensure_restore_prec_limit, SIZET2NUM(pl)); +} diff --git a/test/bigdecimal/test_vp_operation.rb b/test/bigdecimal/test_vp_operation.rb index 5cce40ba..3d527edb 100644 --- a/test/bigdecimal/test_vp_operation.rb +++ b/test/bigdecimal/test_vp_operation.rb @@ -34,6 +34,59 @@ def test_nttmult end end + def test_newton_inverse + xs = [BigDecimal(3), BigDecimal('123e50'), BigDecimal('13' * 44), BigDecimal('17' * 45), BigDecimal('19' * 46)] + %i[up half_up down].each do |rounding_mode| + BigDecimal.save_rounding_mode do + BigDecimal.mode(BigDecimal::ROUND_MODE, rounding_mode) + [*1..32, 50, 100, 200, 300].each do |prec| + xs.each do |x| + inv = x.newton_raphson_inverse(prec) + assert_in_delta(1, x * inv, BigDecimal("1e#{1 - prec}")) + + high_precision_inv = inv * (2 - x * inv) + expected_inv = high_precision_inv.mult(1, prec) + last_digit = BigDecimal("1e#{expected_inv.exponent - prec}") + assert_include([expected_inv - last_digit, expected_inv, expected_inv + last_digit], inv) + end + end + end + end + end + + def test_not_affected_by_limit + x_int = 123**135 + y_int = 135**123 + xy_int = x_int * y_int + mod_int = 111**111 + x = BigDecimal(x_int) + y = BigDecimal(y_int) + xy = BigDecimal(xy_int) + mod = BigDecimal(mod_int) + z = BigDecimal(xy_int + mod_int) + BigDecimal.save_limit do + BigDecimal.limit 3 + assert_equal(xy, x.vpmult(y)) + assert_equal(3, BigDecimal.limit) + if ntt_mult_available? + assert_equal(xy, x.nttmult(y)) + assert_equal(3, BigDecimal.limit) + end + + prec = (z.exponent - 1) / BASE_FIG - (y.exponent - 1) / BASE_FIG + 1 + assert_equal([x, mod], z.vpdivd(y, prec)) + assert_equal(3, BigDecimal.limit) + assert_equal([x, mod], z.vpdivd_newton(y, prec)) + assert_equal(3, BigDecimal.limit) + end + end + + def assert_vpdivd_equal(expected_divmod, x_y_n) + x, *args = x_y_n + assert_equal(expected_divmod, x.vpdivd(*args)) + assert_equal(expected_divmod, x.vpdivd_newton(*args)) + end + def test_vpdivd # a[0] > b[0] # XXXX_YYYY_ZZZZ / 1111 #=> 000X_000Y_000Z @@ -44,11 +97,11 @@ def test_vpdivd d3 = BigDecimal("4e#{BASE_FIG * 2}") + d2 d4 = BigDecimal("5e#{BASE_FIG}") + d3 d5 = BigDecimal(6) + d4 - assert_equal([d1, x1 - d1 * y], x1.vpdivd(y, 1)) - assert_equal([d2, x1 - d2 * y], x1.vpdivd(y, 2)) - assert_equal([d3, x1 - d3 * y], x1.vpdivd(y, 3)) - assert_equal([d4, x1 - d4 * y], x1.vpdivd(y, 4)) - assert_equal([d5, x1 - d5 * y], x1.vpdivd(y, 5)) + assert_vpdivd_equal([d1, x1 - d1 * y], [x1, y, 1]) + assert_vpdivd_equal([d2, x1 - d2 * y], [x1, y, 2]) + assert_vpdivd_equal([d3, x1 - d3 * y], [x1, y, 3]) + assert_vpdivd_equal([d4, x1 - d4 * y], [x1, y, 4]) + assert_vpdivd_equal([d5, x1 - d5 * y], [x1, y, 5]) # a[0] < b[0] # 00XX_XXYY_YYZZ_ZZ00 / 1111 #=> 0000_0X00_0Y00_0Z00 @@ -59,28 +112,28 @@ def test_vpdivd d3 = BigDecimal("4e#{2 * BASE_FIG + shift}") + d2 d4 = BigDecimal("5e#{BASE_FIG + shift}") + d3 d5 = BigDecimal("6e#{shift}") + d4 - assert_equal([0, x2], x2.vpdivd(y, 1)) - assert_equal([d1, x2 - d1 * y], x2.vpdivd(y, 2)) - assert_equal([d2, x2 - d2 * y], x2.vpdivd(y, 3)) - assert_equal([d3, x2 - d3 * y], x2.vpdivd(y, 4)) - assert_equal([d4, x2 - d4 * y], x2.vpdivd(y, 5)) - assert_equal([d5, x2 - d5 * y], x2.vpdivd(y, 6)) + assert_vpdivd_equal([0, x2], [x2, y, 1]) + assert_vpdivd_equal([d1, x2 - d1 * y], [x2, y, 2]) + assert_vpdivd_equal([d2, x2 - d2 * y], [x2, y, 3]) + assert_vpdivd_equal([d3, x2 - d3 * y], [x2, y, 4]) + assert_vpdivd_equal([d4, x2 - d4 * y], [x2, y, 5]) + assert_vpdivd_equal([d5, x2 - d5 * y], [x2, y, 6]) end def test_vpdivd_large_quotient_prec # 0001 / 0003 = 0000_3333_3333 - assert_equal([BigDecimal('0.' + '3' * BASE_FIG * 9), BigDecimal("1e-#{9 * BASE_FIG}")], BigDecimal(1).vpdivd(BigDecimal(3), 10)) + assert_vpdivd_equal([BigDecimal('0.' + '3' * BASE_FIG * 9), BigDecimal("1e-#{9 * BASE_FIG}")], [BigDecimal(1), BigDecimal(3), 10]) # 1000 / 0003 = 0333_3333_3333 - assert_equal([BigDecimal('3' * (BASE_FIG - 1) + '.' + '3' * BASE_FIG * 9), BigDecimal("1e-#{9 * BASE_FIG}")], BigDecimal(BASE / 10).vpdivd(BigDecimal(3), 10)) + assert_vpdivd_equal([BigDecimal('3' * (BASE_FIG - 1) + '.' + '3' * BASE_FIG * 9), BigDecimal("1e-#{9 * BASE_FIG}")], [BigDecimal(BASE / 10), BigDecimal(3), 10]) end def test_vpdivd_with_one x = BigDecimal('1234.2468000001234') - assert_equal([BigDecimal('1234'), BigDecimal('0.2468000001234')], x.vpdivd(BigDecimal(1), 1)) - assert_equal([BigDecimal('+1234.2468'), BigDecimal('+0.1234e-9')], (+x).vpdivd(BigDecimal(+1), 2)) - assert_equal([BigDecimal('-1234.2468'), BigDecimal('+0.1234e-9')], (+x).vpdivd(BigDecimal(-1), 2)) - assert_equal([BigDecimal('-1234.2468'), BigDecimal('-0.1234e-9')], (-x).vpdivd(BigDecimal(+1), 2)) - assert_equal([BigDecimal('+1234.2468'), BigDecimal('-0.1234e-9')], (-x).vpdivd(BigDecimal(-1), 2)) + assert_vpdivd_equal([BigDecimal('1234'), BigDecimal('0.2468000001234')], [x, BigDecimal(1), 1]) + assert_vpdivd_equal([BigDecimal('+1234.2468'), BigDecimal('+0.1234e-9')], [+x, BigDecimal(+1), 2]) + assert_vpdivd_equal([BigDecimal('-1234.2468'), BigDecimal('+0.1234e-9')], [+x, BigDecimal(-1), 2]) + assert_vpdivd_equal([BigDecimal('-1234.2468'), BigDecimal('-0.1234e-9')], [-x, BigDecimal(+1), 2]) + assert_vpdivd_equal([BigDecimal('+1234.2468'), BigDecimal('-0.1234e-9')], [-x, BigDecimal(-1), 2]) end def test_vpdivd_precisions @@ -92,7 +145,7 @@ def test_vpdivd_precisions yn = (y.digits.size + BASE_FIG - 1) / BASE_FIG base = BASE ** (n - xn + yn - 1) div = BigDecimal((x * base / y).to_i) / base - assert_equal([div, x - y * div], BigDecimal(x).vpdivd(y, n)) + assert_vpdivd_equal([div, x - y * div], [BigDecimal(x), BigDecimal(y), n]) end end end @@ -105,7 +158,7 @@ def test_vpdivd_borrow x = y * (3 * BASE**4 + a * BASE**3 + b * BASE**2 + c * BASE + d) / BASE div = BigDecimal(x * BASE / y) / BASE mod = BigDecimal(x) - div * y - assert_equal([div, mod], BigDecimal(x).vpdivd(BigDecimal(y), 5)) + assert_vpdivd_equal([div, mod], [BigDecimal(x), BigDecimal(y), 5]) end end end @@ -117,22 +170,22 @@ def test_vpdivd_large_prec_divisor divy1_1 = BigDecimal(2) divy2_1 = BigDecimal(1) divy2_2 = BigDecimal('1.' + '9' * BASE_FIG) - assert_equal([divy1_1, x - y1 * divy1_1], x.vpdivd(y1, 1)) - assert_equal([divy2_1, x - y2 * divy2_1], x.vpdivd(y2, 1)) - assert_equal([divy2_2, x - y2 * divy2_2], x.vpdivd(y2, 2)) + assert_vpdivd_equal([divy1_1, x - y1 * divy1_1], [x, y1, 1]) + assert_vpdivd_equal([divy2_1, x - y2 * divy2_1], [x, y2, 1]) + assert_vpdivd_equal([divy2_2, x - y2 * divy2_2], [x, y2, 2]) end def test_vpdivd_intermediate_zero if BASE_FIG == 9 x = BigDecimal('123456789.246913578000000000123456789') y = BigDecimal('123456789') - assert_equal([BigDecimal('1.000000002000000000000000001'), BigDecimal(0)], x.vpdivd(y, 4)) - assert_equal([BigDecimal('1.000000000049999999'), BigDecimal('1e-18')], BigDecimal("2.000000000099999999").vpdivd(2, 3)) + assert_vpdivd_equal([BigDecimal('1.000000002000000000000000001'), BigDecimal(0)], [x, y, 4]) + assert_vpdivd_equal([BigDecimal('1.000000000049999999'), BigDecimal('1e-18')], [BigDecimal("2.000000000099999999"), 2, 3]) else x = BigDecimal('1234.246800001234') y = BigDecimal('1234') - assert_equal([BigDecimal('1.000200000001'), BigDecimal(0)], x.vpdivd(y, 4)) - assert_equal([BigDecimal('1.00000499'), BigDecimal('1e-8')], BigDecimal("2.00000999").vpdivd(2, 3)) + assert_vpdivd_equal([BigDecimal('1.000200000001'), BigDecimal(0)], [x, y, 4]) + assert_vpdivd_equal([BigDecimal('1.00000499'), BigDecimal('1e-8')], [BigDecimal("2.00000999"), 2, 3]) end end end From 7ee8b060ecf8874b099a06f15231d7475031287c Mon Sep 17 00:00:00 2001 From: tompng Date: Tue, 16 Sep 2025 01:42:07 +0900 Subject: [PATCH 3/4] Improve taylor series calculation of exp and sin by bit burst algorithm exp and sin becomes orders of magnitude faster. To make log and atan also fast, log and atan now depends on exp and sin. log(x): solve exp(y)-x=0 by Newton's method atan(x): solve tan(y)-x=0 by Newton's method --- lib/bigdecimal.rb | 133 ++++++++++++++++++-------------- lib/bigdecimal/math.rb | 92 ++++++++++++---------- test/bigdecimal/test_bigmath.rb | 34 ++++---- 3 files changed, 146 insertions(+), 113 deletions(-) diff --git a/lib/bigdecimal.rb b/lib/bigdecimal.rb index 12250ce9..998087d8 100644 --- a/lib/bigdecimal.rb +++ b/lib/bigdecimal.rb @@ -60,6 +60,46 @@ def self.nan_computation_result # :nodoc: end BigDecimal::NAN end + + # Iteration for Newton's method with increasing precision + def self.newton_loop(prec, initial_precision: BigDecimal.double_fig / 2, safe_margin: 2) # :nodoc: + precs = [] + while prec > initial_precision + precs << prec + prec = (precs.last + 1) / 2 + safe_margin + end + precs.reverse_each do |p| + yield p + end + end + + # Calculates Math.log(x.to_f) considering large or small exponent + def self.float_log(x) # :nodoc: + Math.log(x._decimal_shift(-x.exponent).to_f) + x.exponent * Math.log(10) + end + + # Calculating Taylor series sum using binary splitting method + # Calculates f(x) = (x/d0)*(1+(x/d1)*(1+(x/d2)*(1+(x/d3)*(1+...)))) + # x.n_significant_digits or ds.size must be small to be performant. + def self.taylor_sum_binary_splitting(x, ds, prec) # :nodoc: + fs = ds.map {|d| [0, BigDecimal(d)] } + # fs = [[a0, a1], [b0, b1], [c0, c1], ...] + # f(x) = a0/a1+(x/a1)*(1+b0/b1+(x/b1)*(1+c0/c1+(x/c1)*(1+d0/d1+(x/d1)*(1+...)))) + while fs.size > 1 + # Merge two adjacent fractions + # from: (1 + a0/a1 + x/a1 * (1 + b0/b1 + x/b1 * rest)) + # to: (1 + (a0*b1+x*(b0+b1))/(a1*b1) + (x*x)/(a1*b1) * rest) + xn = xn ? xn.mult(xn, prec) : x + fs = fs.each_slice(2).map do |(a, b)| + b ||= [0, BigDecimal(1)._decimal_shift([xn.exponent, 0].max + 2)] + [ + (a[0] * b[1]).add(xn * (b[0] + b[1]), prec), + a[1].mult(b[1], prec) + ] + end + end + BigDecimal(fs[0][0]).div(fs[0][1], prec) + end end # call-seq: @@ -226,9 +266,7 @@ def sqrt(prec) ex = exponent / 2 x = _decimal_shift(-2 * ex) y = BigDecimal(Math.sqrt(x.to_f), 0) - precs = [prec + BigDecimal.double_fig] - precs << 2 + precs.last / 2 while precs.last > BigDecimal.double_fig - precs.reverse_each do |p| + Internal.newton_loop(prec + BigDecimal.double_fig) do |p| y = y.add(x.div(y, p), p).div(2, p) end y._decimal_shift(ex).mult(1, prec) @@ -264,59 +302,32 @@ def log(x, prec) return BigDecimal(0) if x == 1 prec2 = prec + BigDecimal.double_fig - BigDecimal.save_limit do - BigDecimal.limit(0) - if x > 10 || x < 0.1 - log10 = log(BigDecimal(10), prec2) - exponent = x.exponent - x = x._decimal_shift(-exponent) - if x < 0.3 - x *= 10 - exponent -= 1 - end - return (log10 * exponent).add(log(x, prec2), prec) - end - - x_minus_one_exponent = (x - 1).exponent - # log(x) = log(sqrt(sqrt(sqrt(sqrt(x))))) * 2**sqrt_steps - sqrt_steps = [Integer.sqrt(prec2) + 3 * x_minus_one_exponent, 0].max - - lg2 = 0.3010299956639812 - sqrt_prec = prec2 + [-x_minus_one_exponent, 0].max + (sqrt_steps * lg2).ceil - - sqrt_steps.times do - x = x.sqrt(sqrt_prec) - end - - # Taylor series for log(x) around 1 - # log(x) = -log((1 + X) / (1 - X)) where X = (x - 1) / (x + 1) - # log(x) = 2 * (X + X**3 / 3 + X**5 / 5 + X**7 / 7 + ...) - x = (x - 1).div(x + 1, sqrt_prec) - y = x - x2 = x.mult(x, prec2) - 1.step do |i| - n = prec2 + x.exponent - y.exponent + x2.exponent - break if n <= 0 || x.zero? - x = x.mult(x2.round(n - x2.exponent), n) - y = y.add(x.div(2 * i + 1, n), prec2) - end + if x < 0.1 || x > 10 + exponent = (3 * x).exponent - 1 + x = x._decimal_shift(-exponent) + return log(10, prec2).mult(exponent, prec2).add(log(x, prec2), prec) + end - y.mult(2 ** (sqrt_steps + 1), prec) + # Solve exp(y) - x = 0 with Newton's method + # Repeat: y -= (exp(y) - x) / exp(y) + y = BigDecimal(BigDecimal::Internal.float_log(x), 0) + exp_additional_prec = [-(x - 1).exponent, 0].max + BigDecimal::Internal.newton_loop(prec2) do |p| + expy = exp(y, p + exp_additional_prec) + y = y.sub(expy.sub(x, p).div(expy, p), p) end + y.mult(1, prec) end - # Taylor series for exp(x) around 0 - private_class_method def _exp_taylor(x, prec) # :nodoc: - xn = BigDecimal(1) - y = BigDecimal(1) - 1.step do |i| - n = prec + xn.exponent - break if n <= 0 || xn.zero? - xn = xn.mult(x, n).div(i, n) - y = y.add(xn, prec) - end - y + private_class_method def _exp_binary_splitting(x, prec) # :nodoc: + return BigDecimal(1) if x.zero? + # Find k that satisfies x**k / k! < 10**(-prec) + log10 = Math.log(10) + logx = BigDecimal::Internal.float_log(x.abs) + step = (1..).bsearch { |k| Math.lgamma(k + 1)[0] - k * logx > prec * log10 } + # exp(x)-1 = x*(1+x/2*(1+x/3*(1+x/4*(1+x/5*(1+...))))) + 1 + BigDecimal::Internal.taylor_sum_binary_splitting(x, [*1..step], prec) end # call-seq: @@ -341,11 +352,21 @@ def exp(x, prec) prec2 = prec + BigDecimal.double_fig + cnt x = x._decimal_shift(-cnt) - # Calculation of exp(small_prec) is fast because calculation of x**n is fast - # Calculation of exp(small_abs) converges fast. - # exp(x) = exp(small_prec_part + small_abs_part) = exp(small_prec_part) * exp(small_abs_part) - x_small_prec = x.round(Integer.sqrt(prec2)) - y = _exp_taylor(x_small_prec, prec2).mult(_exp_taylor(x.sub(x_small_prec, prec2), prec2), prec2) + # Decimal form of bit-burst algorithm + # Calculate exp(x.xxxxxxxxxxxxxxxx) as + # exp(x.xx) * exp(0.00xx) * exp(0.0000xxxx) * exp(0.00000000xxxxxxxx) + x = x.mult(1, prec2) + n = 2 + y = BigDecimal(1) + BigDecimal.save_limit do + BigDecimal.limit(0) + while x != 0 do + partial_x = x.truncate(n) + x -= partial_x + y = y.mult(_exp_binary_splitting(partial_x, prec2), prec2) + n *= 2 + end + end # calculate exp(x * 10**cnt) from exp(x) # exp(x * 10**k) = exp(x * 10**(k - 1)) ** 10 diff --git a/lib/bigdecimal/math.rb b/lib/bigdecimal/math.rb index 9dff366d..a4bb704a 100644 --- a/lib/bigdecimal/math.rb +++ b/lib/bigdecimal/math.rb @@ -94,6 +94,37 @@ def sqrt(x, prec) end end + private_class_method def _sin_binary_splitting(x, prec) # :nodoc: + return x if x.zero? + x2 = x.mult(x, prec) + # Find k that satisfies x2**k / (2k+1)! < 10**(-prec) + log10 = Math.log(10) + logx = BigDecimal::Internal.float_log(x.abs) + step = (1..).bsearch { |k| Math.lgamma(2 * k + 1)[0] - 2 * k * logx > prec * log10 } + # Construct denominator sequence for binary splitting + # sin(x) = x*(1-x2/(2*3)*(1-x2/(4*5)*(1-x2/(6*7)*(1-x2/(8*9)*(1-...))))) + ds = (1..step).map {|i| -(2 * i) * (2 * i + 1) } + x.mult(1 + BigDecimal::Internal.taylor_sum_binary_splitting(x2, ds, prec), prec) + end + + private_class_method def _sin_around_zero(x, prec) # :nodoc: + # Divide x into several parts + # sin(x.xxxxxxxx...) = sin(x.xx + 0.00xx + 0.0000xxxx + ...) + # Calculate sin of each part and restore sin(0.xxxxxxxx...) using addition theorem. + sin = BigDecimal(0) + cos = BigDecimal(1) + n = 2 + while x != 0 do + partial_x = x.truncate(n) + x -= partial_x + s = _sin_binary_splitting(partial_x, prec) + c = (1 - s * s).sqrt(prec) + sin, cos = (sin * c).add(cos * s, prec), (cos * c).sub(sin * s, prec) + n *= 2 + end + sin.clamp(BigDecimal(-1), BigDecimal(1)) + end + # call-seq: # cbrt(decimal, numeric) -> BigDecimal # @@ -156,26 +187,9 @@ def sin(x, prec) prec = BigDecimal::Internal.coerce_validate_prec(prec, :sin) x = BigDecimal::Internal.coerce_to_bigdecimal(x, prec, :sin) return BigDecimal::Internal.nan_computation_result if x.infinite? || x.nan? - n = prec + BigDecimal.double_fig - one = BigDecimal("1") - two = BigDecimal("2") + n = prec + BigDecimal.double_fig sign, x = _sin_periodic_reduction(x, n) - x1 = x - x2 = x.mult(x,n) - y = x - d = y - i = one - z = one - while d.nonzero? && ((m = n - (y.exponent - d.exponent).abs) > 0) - m = BigDecimal.double_fig if m < BigDecimal.double_fig - x1 = -x2.mult(x1,n) - i += two - z *= (i-one) * i - d = x1.div(z,m) - y += d - end - y = BigDecimal("1") if y > 1 - y.mult(sign, prec) + _sin_around_zero(x, n).mult(sign, prec) end # call-seq: @@ -193,8 +207,9 @@ def cos(x, prec) prec = BigDecimal::Internal.coerce_validate_prec(prec, :cos) x = BigDecimal::Internal.coerce_to_bigdecimal(x, prec, :cos) return BigDecimal::Internal.nan_computation_result if x.infinite? || x.nan? - sign, x = _sin_periodic_reduction(x, prec + BigDecimal.double_fig, add_half_pi: true) - sign * sin(x, prec) + n = prec + BigDecimal.double_fig + sign, x = _sin_periodic_reduction(x, n, add_half_pi: true) + _sin_around_zero(x, n).mult(sign, prec) end # call-seq: @@ -283,28 +298,21 @@ def atan(x, prec) x = BigDecimal::Internal.coerce_to_bigdecimal(x, prec, :atan) return BigDecimal::Internal.nan_computation_result if x.nan? n = prec + BigDecimal.double_fig - pi = PI(n) + return PI(n).div(2 * x.infinite?, prec) if x.infinite? + x = -x if neg = x < 0 - return pi.div(neg ? -2 : 2, prec) if x.infinite? - return pi.div(neg ? -4 : 4, prec) if x.round(n) == 1 - x = BigDecimal("1").div(x, n) if inv = x > 1 - x = (-1 + sqrt(1 + x.mult(x, n), n)).div(x, n) if dbl = x > 0.5 - y = x - d = y - t = x - r = BigDecimal("3") - x2 = x.mult(x,n) - while d.nonzero? && ((m = n - (y.exponent - d.exponent).abs) > 0) - m = BigDecimal.double_fig if m < BigDecimal.double_fig - t = -t.mult(x2,n) - d = t.div(r,m) - y += d - r += 2 + x = BigDecimal(1).div(x, n) if inv = x < -1 || x > 1 + + # Solve tan(y) - x = 0 with Newton's method + # Repeat: y -= (tan(y) - x) * cos(y)**2 + y = BigDecimal(Math.atan(x.to_f), 0) + BigDecimal::Internal.newton_loop(n) do |p| + s = sin(y, p) + c = (1 - s * s).sqrt(p) + y = y.sub(c * (s.sub(c * x.mult(1, p), p)), p) end - y *= 2 if dbl - y = pi / 2 - y if inv - y = -y if neg - y.mult(1, prec) + y = PI(n) / 2 - y if inv + y.mult(neg ? -1 : 1, prec) end # call-seq: @@ -804,7 +812,7 @@ def lgamma(x, prec) loggamma_k = 0 ck_exponents = (1..a-1).map do |k| loggamma_k += Math.log10(k - 1) if k > 1 - -loggamma_k - k / log10f + (k - 0.5) * Math.log10(a - k) - BigMath.log10(x_low_prec.add(k, low_prec), low_prec) + -loggamma_k - k / log10f + (k - 0.5) * Math.log10(a - k) - BigDecimal::Internal.float_log(x_low_prec.add(k, low_prec)) / log10f end # Estimate exponent of sum by Stirling's approximation diff --git a/test/bigdecimal/test_bigmath.rb b/test/bigdecimal/test_bigmath.rb index 5a6f4ee0..37f24e35 100644 --- a/test/bigdecimal/test_bigmath.rb +++ b/test/bigdecimal/test_bigmath.rb @@ -197,8 +197,13 @@ def test_sin assert_converge_in_precision {|n| sin(BigDecimal("1e-30"), n) } assert_converge_in_precision {|n| sin(BigDecimal(PI(50)), n) } assert_converge_in_precision {|n| sin(BigDecimal(PI(50) * 100), n) } - assert_operator(sin(PI(30) / 2, 30), :<=, 1) - assert_operator(sin(-PI(30) / 2, 30), :>=, -1) + [:up, :down].each do |mode| + BigDecimal.save_rounding_mode do + BigDecimal.mode(BigDecimal::ROUND_MODE, mode) + assert_operator(sin(PI(30) / 2, 30), :<=, 1) + assert_operator(sin(-PI(30) / 2, 30), :>=, -1) + end + end end def test_cos @@ -220,8 +225,13 @@ def test_cos assert_converge_in_precision {|n| cos(BigDecimal("1e50"), n) } assert_converge_in_precision {|n| cos(BigDecimal(PI(50) / 2), n) } assert_converge_in_precision {|n| cos(BigDecimal(PI(50) * 201 / 2), n) } - assert_operator(cos(PI(30), 30), :>=, -1) - assert_operator(cos(PI(30) * 2, 30), :<=, 1) + [:up, :down].each do |mode| + BigDecimal.save_rounding_mode do + BigDecimal.mode(BigDecimal::ROUND_MODE, mode) + assert_operator(cos(PI(30), 30), :>=, -1) + assert_operator(cos(PI(30) * 2, 30), :<=, 1) + end + end end def test_tan @@ -404,26 +414,20 @@ def test_exp def test_log assert_equal(0, log(BigDecimal("1.0"), 10)) - assert_in_epsilon(Math.log(10)*1000, log(BigDecimal("1e1000"), 10)) + assert_in_epsilon(1000 * Math.log(10), log(BigDecimal("1e1000"), 10)) + assert_in_epsilon(19999999999999 * Math.log(10), log(BigDecimal("1E19999999999999"), 10)) + assert_in_epsilon(-19999999999999 * Math.log(10), log(BigDecimal("1E-19999999999999"), 10)) assert_in_exact_precision( BigDecimal("2.3025850929940456840179914546843642076011014886287729760333279009675726096773524802359972050895982983419677840422862"), log(BigDecimal("10"), 100), 100 ) assert_converge_in_precision {|n| log(BigDecimal("2"), n) } - assert_converge_in_precision {|n| log(BigDecimal("1e-30") + 1, n) } - assert_converge_in_precision {|n| log(BigDecimal("1e-30"), n) } + assert_converge_in_precision {|n| log(1 + SQRT2 * BigDecimal("1e-30"), n) } + assert_converge_in_precision {|n| log(SQRT2 * BigDecimal("1e-30"), n) } assert_converge_in_precision {|n| log(BigDecimal("1e30"), n) } assert_converge_in_precision {|n| log(SQRT2, n) } assert_raise(Math::DomainError) {log(BigDecimal("-0.1"), 10)} - begin - x = BigDecimal("1E19999999999999") - rescue FloatDomainError - else - unless x.infinite? - assert_in_epsilon(Math.log(10) * 19999999999999, BigMath.log(x, 10)) - end - end end def test_log2 From 9f2bc3c376eae6fd9ae1b79876996a02d607cefb Mon Sep 17 00:00:00 2001 From: tompng Date: Fri, 19 Sep 2025 19:54:19 +0900 Subject: [PATCH 4/4] Drop Ruby 2.5 support bsearch for endless range is only available in ruby >= 2.6 --- .github/workflows/ci.yml | 2 +- bigdecimal.gemspec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 68816212..65541539 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: uses: ruby/actions/.github/workflows/ruby_versions.yml@master with: engine: cruby-truffleruby - min_version: 2.5 + min_version: 2.6 versions: '["debug"]' host: diff --git a/bigdecimal.gemspec b/bigdecimal.gemspec index 6b20ac08..774fd223 100644 --- a/bigdecimal.gemspec +++ b/bigdecimal.gemspec @@ -53,7 +53,7 @@ Gem::Specification.new do |s| ] end - s.required_ruby_version = Gem::Requirement.new(">= 2.5.0") + s.required_ruby_version = Gem::Requirement.new(">= 2.6.0") s.metadata["changelog_uri"] = s.homepage + "/blob/master/CHANGES.md" end