From 118143aa533564117ca5e05b00a6b7d84bca31a6 Mon Sep 17 00:00:00 2001 From: Patrick Assuied Date: Wed, 31 Dec 2025 17:18:26 -0800 Subject: [PATCH 1/5] Fixed decimal encoding in a backwards compatible way --- logical_type.go | 87 +++++++++++++-- logical_type_test.go | 253 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 9 deletions(-) diff --git a/logical_type.go b/logical_type.go index 80e9dc2..65ba27e 100644 --- a/logical_type.go +++ b/logical_type.go @@ -276,13 +276,15 @@ func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, sche st[decimalSearchType] = c c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale) - c.textualFromNative = decimalBytesFromNative(bytesTextualFromNative, toSignedBytes, precision, scale) - c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, precision, scale) - c.nativeFromTextual = nativeFromDecimalBytes(bytesNativeFromTextual, precision, scale) + c.textualFromNative = decimalTextualFromNative(scale) + c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalTextual() return c, nil } -func nativeFromDecimalBytes(fn toNativeFn, precision, scale int) toNativeFn { +// nativeFromDecimalBytes decodes bytes to *big.Rat with backwards compatibility +// for incorrectly encoded ASCII decimal strings. +func nativeFromDecimalBytes(fn toNativeFn, scale int) toNativeFn { return func(bytes []byte) (interface{}, []byte, error) { d, b, err := fn(bytes) if err != nil { @@ -292,11 +294,20 @@ func nativeFromDecimalBytes(fn toNativeFn, precision, scale int) toNativeFn { if !ok { return nil, bytes, fmt.Errorf("cannot transform to native decimal, expected []byte, received %T", d) } + + // Check if bytes look like ASCII decimal string (backwards compat) + if looksLikeASCIIDecimal(bs) { + r := new(big.Rat) + if _, ok := r.SetString(string(bs)); ok { + return r, b, nil + } + } + + // Normal two's-complement decoding num := big.NewInt(0) fromSignedBytes(num, bs) denom := new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(scale)), nil) - r := new(big.Rat).SetFrac(num, denom) - return r, b, nil + return new(big.Rat).SetFrac(num, denom), b, nil } } @@ -320,6 +331,64 @@ func decimalBytesFromNative(fromNativeFn fromNativeFn, toBytesFn toBytesFn, prec } } +// decimalTextualFromNative encodes a *big.Rat to a JSON string representation +// like "40.20" according to the Avro 1.10.2 spec. +func decimalTextualFromNative(scale int) fromNativeFn { + return func(b []byte, d interface{}) ([]byte, error) { + r, ok := d.(*big.Rat) + if !ok { + return nil, fmt.Errorf("cannot transform to textual decimal, expected *big.Rat, received %T", d) + } + // Format as decimal string with proper scale + return stringTextualFromNative(b, r.FloatString(scale)) + } +} + +// nativeFromDecimalTextual decodes a JSON string like "40.20" to a *big.Rat +// according to the Avro 1.10.2 spec. +func nativeFromDecimalTextual() toNativeFn { + return func(buf []byte) (interface{}, []byte, error) { + s, remaining, err := stringNativeFromTextual(buf) + if err != nil { + return nil, nil, fmt.Errorf("cannot decode textual decimal: %s", err) + } + r := new(big.Rat) + if _, ok := r.SetString(s.(string)); !ok { + return nil, nil, fmt.Errorf("cannot parse decimal string: %q", s) + } + return r, remaining, nil + } +} + +// looksLikeASCIIDecimal checks if the bytes look like an ASCII decimal string +// (for backwards compatibility with incorrectly encoded data). +// Returns true if all bytes are printable ASCII and form a valid decimal pattern. +func looksLikeASCIIDecimal(bs []byte) bool { + if len(bs) == 0 { + return false + } + hasDigit := false + hasDot := false + for i, b := range bs { + switch { + case b >= '0' && b <= '9': + hasDigit = true + case b == '.': + if hasDot { + return false // multiple dots + } + hasDot = true + case b == '-' || b == '+': + if i != 0 { + return false // sign not at start + } + default: + return false // non-decimal character + } + } + return hasDigit +} + func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap) if err != nil { @@ -337,9 +406,9 @@ func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, sche return nil, err } c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale) - c.textualFromNative = decimalBytesFromNative(c.textualFromNative, toSignedFixedBytes(size), precision, scale) - c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, precision, scale) - c.nativeFromTextual = nativeFromDecimalBytes(c.nativeFromTextual, precision, scale) + c.textualFromNative = decimalTextualFromNative(scale) + c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalTextual() return c, nil } diff --git a/logical_type_test.go b/logical_type_test.go index a52fc1d..b9cb1b2 100644 --- a/logical_type_test.go +++ b/logical_type_test.go @@ -183,6 +183,259 @@ func TestDecimalBytesLogicalTypeInRecordDecodeWithDefault(t *testing.T) { testBinaryCodecPass(t, schema, map[string]interface{}{"mydecimal": big.NewRat(617, 50)}, []byte("\x04\x04\xd2")) } +func TestDecimalBytesTextualRoundTrip(t *testing.T) { + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + testCases := []struct { + textual string + expected *big.Rat + }{ + {`"40.20"`, big.NewRat(4020, 100)}, + {`"12.34"`, big.NewRat(1234, 100)}, + {`"-12.34"`, big.NewRat(-1234, 100)}, + {`"0.00"`, big.NewRat(0, 1)}, + {`"99.99"`, big.NewRat(9999, 100)}, + } + + for _, tc := range testCases { + // Decode textual to native + native, _, err := codec.NativeFromTextual([]byte(tc.textual)) + if err != nil { + t.Fatalf("NativeFromTextual(%s): %v", tc.textual, err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromTextual(%s): expected *big.Rat, got %T", tc.textual, native) + } + + if rat.Cmp(tc.expected) != 0 { + t.Errorf("NativeFromTextual(%s): got %v, want %v", tc.textual, rat, tc.expected) + } + + // Encode native to textual + textual, err := codec.TextualFromNative(nil, rat) + if err != nil { + t.Fatalf("TextualFromNative(%v): %v", rat, err) + } + + if string(textual) != tc.textual { + t.Errorf("TextualFromNative(%v): got %s, want %s", rat, textual, tc.textual) + } + } +} + +func TestDecimalFixedTextualRoundTrip(t *testing.T) { + schema := `{"type": "fixed", "size": 12, "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + testCases := []struct { + textual string + expected *big.Rat + }{ + {`"40.20"`, big.NewRat(4020, 100)}, + {`"12.34"`, big.NewRat(1234, 100)}, + {`"-12.34"`, big.NewRat(-1234, 100)}, + {`"0.00"`, big.NewRat(0, 1)}, + } + + for _, tc := range testCases { + // Decode textual to native + native, _, err := codec.NativeFromTextual([]byte(tc.textual)) + if err != nil { + t.Fatalf("NativeFromTextual(%s): %v", tc.textual, err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromTextual(%s): expected *big.Rat, got %T", tc.textual, native) + } + + if rat.Cmp(tc.expected) != 0 { + t.Errorf("NativeFromTextual(%s): got %v, want %v", tc.textual, rat, tc.expected) + } + + // Encode native to textual + textual, err := codec.TextualFromNative(nil, rat) + if err != nil { + t.Fatalf("TextualFromNative(%v): %v", rat, err) + } + + if string(textual) != tc.textual { + t.Errorf("TextualFromNative(%v): got %s, want %s", rat, textual, tc.textual) + } + } +} + +func TestDecimalBytesBackwardsCompatibility(t *testing.T) { + // Test that binary data incorrectly encoded as ASCII decimal strings + // can still be decoded correctly (backwards compatibility) + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + // Simulate incorrectly encoded data: "40.20" as ASCII bytes + // Length prefix (10 = 0x14 in varint) + ASCII bytes for "40.20" + incorrectlyEncodedBytes := append([]byte{0x0a}, []byte("40.20")...) + + native, _, err := codec.NativeFromBinary(incorrectlyEncodedBytes) + if err != nil { + t.Fatalf("NativeFromBinary (backwards compat): %v", err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) + } + + expected := big.NewRat(4020, 100) + if rat.Cmp(expected) != 0 { + t.Errorf("NativeFromBinary (backwards compat): got %v, want %v", rat, expected) + } +} + +func TestDecimalBytesCorrectBinaryEncoding(t *testing.T) { + // Test that correctly encoded binary data (two's complement) still works + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + // 40.20 = 4020 with scale 2 + // 4020 in two's complement = 0x0FB4 (big-endian) + // Avro bytes: length prefix (4 = 0x04) + 0x0F, 0xB4 + correctlyEncodedBytes := []byte{0x04, 0x0f, 0xb4} + + native, _, err := codec.NativeFromBinary(correctlyEncodedBytes) + if err != nil { + t.Fatalf("NativeFromBinary: %v", err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) + } + + expected := big.NewRat(4020, 100) + if rat.Cmp(expected) != 0 { + t.Errorf("NativeFromBinary: got %v, want %v", rat, expected) + } +} + +func TestDecimalTextualToBinaryRoundTrip(t *testing.T) { + // Test the full flow: textual -> native -> binary -> native -> textual + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + originalTextual := []byte(`"40.20"`) + + // Step 1: Textual -> Native + native1, _, err := codec.NativeFromTextual(originalTextual) + if err != nil { + t.Fatalf("NativeFromTextual: %v", err) + } + + // Step 2: Native -> Binary + binary, err := codec.BinaryFromNative(nil, native1) + if err != nil { + t.Fatalf("BinaryFromNative: %v", err) + } + + // Verify binary is two's complement, not ASCII string + // 4020 = 0x0FB4 in hex + expectedBinary := []byte{0x04, 0x0f, 0xb4} + if string(binary) != string(expectedBinary) { + t.Errorf("BinaryFromNative: got %x, want %x", binary, expectedBinary) + } + + // Step 3: Binary -> Native + native2, _, err := codec.NativeFromBinary(binary) + if err != nil { + t.Fatalf("NativeFromBinary: %v", err) + } + + // Step 4: Native -> Textual + textual, err := codec.TextualFromNative(nil, native2) + if err != nil { + t.Fatalf("TextualFromNative: %v", err) + } + + if string(textual) != string(originalTextual) { + t.Errorf("Round-trip failed: got %s, want %s", textual, originalTextual) + } +} + +func TestLooksLikeASCIIDecimal(t *testing.T) { + testCases := []struct { + input []byte + expected bool + }{ + {[]byte("40.20"), true}, + {[]byte("-40.20"), true}, + {[]byte("+40.20"), true}, + {[]byte("0"), true}, + {[]byte("123456"), true}, + {[]byte(".5"), true}, + {[]byte("5."), true}, + {[]byte(""), false}, + {[]byte("-"), false}, + {[]byte("40.20.30"), false}, // multiple dots + {[]byte("40-20"), false}, // sign not at start + {[]byte("40a20"), false}, // non-decimal char + {[]byte("\x0f\xb4"), false}, // binary data (two's complement) + {[]byte{0x00}, false}, // null byte + {[]byte{0xff, 0xff}, false}, // high bytes (negative two's complement) + {[]byte("12.34e5"), false}, // scientific notation not supported + } + + for _, tc := range testCases { + result := looksLikeASCIIDecimal(tc.input) + if result != tc.expected { + t.Errorf("looksLikeASCIIDecimal(%q): got %v, want %v", tc.input, result, tc.expected) + } + } +} + +func TestDecimalNegativeBackwardsCompatibility(t *testing.T) { + // Test backwards compatibility with negative numbers encoded as ASCII + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + // Simulate incorrectly encoded data: "-40.20" as ASCII bytes + incorrectlyEncodedBytes := append([]byte{0x0c}, []byte("-40.20")...) + + native, _, err := codec.NativeFromBinary(incorrectlyEncodedBytes) + if err != nil { + t.Fatalf("NativeFromBinary (backwards compat): %v", err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) + } + + expected := big.NewRat(-4020, 100) + if rat.Cmp(expected) != 0 { + t.Errorf("NativeFromBinary (backwards compat): got %v, want %v", rat, expected) + } +} + func TestValidatedStringLogicalTypeInRecordEncode(t *testing.T) { schema := `{ "type": "record", From c56c6b61c95a9f402c98a59589f100d32c5dd978 Mon Sep 17 00:00:00 2001 From: Patrick Assuied Date: Wed, 31 Dec 2025 17:25:02 -0800 Subject: [PATCH 2/5] format fix --- logical_type.go | 2 +- logical_type_test.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/logical_type.go b/logical_type.go index 65ba27e..6266735 100644 --- a/logical_type.go +++ b/logical_type.go @@ -311,7 +311,7 @@ func nativeFromDecimalBytes(fn toNativeFn, scale int) toNativeFn { } } -func decimalBytesFromNative(fromNativeFn fromNativeFn, toBytesFn toBytesFn, precision, scale int) fromNativeFn { +func decimalBytesFromNative(fromNativeFn fromNativeFn, toBytesFn toBytesFn, _, scale int) fromNativeFn { return func(b []byte, d interface{}) ([]byte, error) { r, ok := d.(*big.Rat) if !ok { diff --git a/logical_type_test.go b/logical_type_test.go index b9cb1b2..f532f03 100644 --- a/logical_type_test.go +++ b/logical_type_test.go @@ -392,13 +392,13 @@ func TestLooksLikeASCIIDecimal(t *testing.T) { {[]byte("5."), true}, {[]byte(""), false}, {[]byte("-"), false}, - {[]byte("40.20.30"), false}, // multiple dots - {[]byte("40-20"), false}, // sign not at start - {[]byte("40a20"), false}, // non-decimal char - {[]byte("\x0f\xb4"), false}, // binary data (two's complement) - {[]byte{0x00}, false}, // null byte - {[]byte{0xff, 0xff}, false}, // high bytes (negative two's complement) - {[]byte("12.34e5"), false}, // scientific notation not supported + {[]byte("40.20.30"), false}, // multiple dots + {[]byte("40-20"), false}, // sign not at start + {[]byte("40a20"), false}, // non-decimal char + {[]byte("\x0f\xb4"), false}, // binary data (two's complement) + {[]byte{0x00}, false}, // null byte + {[]byte{0xff, 0xff}, false}, // high bytes (negative two's complement) + {[]byte("12.34e5"), false}, // scientific notation not supported } for _, tc := range testCases { From 7229ca3df45b563b2c2e883578495ffd7f00c902 Mon Sep 17 00:00:00 2001 From: Patrick Assuied Date: Fri, 2 Jan 2026 18:03:30 -0800 Subject: [PATCH 3/5] Added option for enabling backwards compatiblity decoding for obvious use cases (i.e. has a `.`) --- codec.go | 14 +++++++++-- logical_type.go | 31 ++++++++++++++++-------- logical_type_test.go | 57 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 82 insertions(+), 20 deletions(-) diff --git a/codec.go b/codec.go index 192bcf6..f8a21e0 100644 --- a/codec.go +++ b/codec.go @@ -49,6 +49,15 @@ type CodecOption struct { // When true, the string literal "null" in textual Avro data will be coerced to Go's nil. // Primarily used to handle edge cases where some Avro implementations allow string representations of null. EnableStringNull bool + + // EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding controls backwards compatibility for decimal + // binary decoding. When true, the decoder will first check if binary bytes look like an + // ASCII decimal string (e.g., "40.20") and parse them as such. This handles legacy data + // that was incorrectly encoded as ASCII strings instead of two's-complement bytes. + // WARNING: This can cause incorrect decoding if a valid two's-complement value happens + // to look like ASCII digits. Only enable this if you have legacy incorrectly-encoded data. + // Default: false (use correct two's-complement decoding only) + EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding bool } // Codec supports decoding binary and text Avro data to Go native data types, @@ -83,6 +92,7 @@ type codecBuilder struct { func DefaultCodecOption() *CodecOption { return &CodecOption{ EnableStringNull: true, + EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: false, } } @@ -739,9 +749,9 @@ func buildCodecForTypeDescribedByString(st map[string]*Codec, enclosingNamespace case "record": return makeRecordCodec(st, enclosingNamespace, schemaMap, cb) case "bytes.decimal": - return makeDecimalBytesCodec(st, enclosingNamespace, schemaMap) + return makeDecimalBytesCodec(st, enclosingNamespace, schemaMap, cb) case "fixed.decimal": - return makeDecimalFixedCodec(st, enclosingNamespace, schemaMap) + return makeDecimalFixedCodec(st, enclosingNamespace, schemaMap, cb) case "string.validated-string": return makeValidatedStringCodec(st, enclosingNamespace, schemaMap) default: diff --git a/logical_type.go b/logical_type.go index 6266735..3ed10e2 100644 --- a/logical_type.go +++ b/logical_type.go @@ -258,7 +258,7 @@ func precisionAndScaleFromSchemaMap(schemaMap map[string]interface{}) (int, int, var one = big.NewInt(1) -func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { +func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}, cb *codecBuilder) (*Codec, error) { precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap) if err != nil { return nil, err @@ -275,16 +275,20 @@ func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, sche decimalSearchType := fmt.Sprintf("bytes.decimal.%d.%d", precision, scale) st[decimalSearchType] = c + // Check if backwards compatibility mode is enabled + enableBackwardsCompat := cb != nil && cb.option != nil && cb.option.EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding + c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale) c.textualFromNative = decimalTextualFromNative(scale) - c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale) + c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale, enableBackwardsCompat) c.nativeFromTextual = nativeFromDecimalTextual() return c, nil } -// nativeFromDecimalBytes decodes bytes to *big.Rat with backwards compatibility -// for incorrectly encoded ASCII decimal strings. -func nativeFromDecimalBytes(fn toNativeFn, scale int) toNativeFn { +// nativeFromDecimalBytes decodes bytes to *big.Rat. +// If enableBackwardsCompat is true, it first checks if bytes look like an ASCII decimal +// string (for backwards compatibility with incorrectly encoded legacy data). +func nativeFromDecimalBytes(fn toNativeFn, scale int, enableBackwardsCompat bool) toNativeFn { return func(bytes []byte) (interface{}, []byte, error) { d, b, err := fn(bytes) if err != nil { @@ -296,7 +300,7 @@ func nativeFromDecimalBytes(fn toNativeFn, scale int) toNativeFn { } // Check if bytes look like ASCII decimal string (backwards compat) - if looksLikeASCIIDecimal(bs) { + if enableBackwardsCompat && looksLikeASCIIDecimal(bs) { r := new(big.Rat) if _, ok := r.SetString(string(bs)); ok { return r, b, nil @@ -362,7 +366,9 @@ func nativeFromDecimalTextual() toNativeFn { // looksLikeASCIIDecimal checks if the bytes look like an ASCII decimal string // (for backwards compatibility with incorrectly encoded data). -// Returns true if all bytes are printable ASCII and form a valid decimal pattern. +// Returns true if all bytes are printable ASCII, form a valid decimal pattern, +// and contain a decimal point. Requiring a decimal point reduces false positives +// since 0x2E ('.') is unlikely to appear in valid two's-complement encoded numbers. func looksLikeASCIIDecimal(bs []byte) bool { if len(bs) == 0 { return false @@ -386,10 +392,11 @@ func looksLikeASCIIDecimal(bs []byte) bool { return false // non-decimal character } } - return hasDigit + // Require both a digit and a decimal point to reduce false positives + return hasDigit && hasDot } -func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { +func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}, cb *codecBuilder) (*Codec, error) { precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap) if err != nil { return nil, err @@ -405,9 +412,13 @@ func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, sche if err != nil { return nil, err } + + // Check if backwards compatibility mode is enabled + enableBackwardsCompat := cb != nil && cb.option != nil && cb.option.EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding + c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale) c.textualFromNative = decimalTextualFromNative(scale) - c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale) + c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale, enableBackwardsCompat) c.nativeFromTextual = nativeFromDecimalTextual() return c, nil } diff --git a/logical_type_test.go b/logical_type_test.go index f532f03..6a0dad9 100644 --- a/logical_type_test.go +++ b/logical_type_test.go @@ -276,15 +276,18 @@ func TestDecimalFixedTextualRoundTrip(t *testing.T) { func TestDecimalBytesBackwardsCompatibility(t *testing.T) { // Test that binary data incorrectly encoded as ASCII decimal strings - // can still be decoded correctly (backwards compatibility) + // can still be decoded correctly when backwards compatibility is enabled schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - codec, err := NewCodec(schema) + + // Create codec with backwards compatibility enabled + opt := &CodecOption{EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: true} + codec, err := NewCodecWithOptions(schema, opt) if err != nil { t.Fatal(err) } // Simulate incorrectly encoded data: "40.20" as ASCII bytes - // Length prefix (10 = 0x14 in varint) + ASCII bytes for "40.20" + // Length prefix (10 = 0x0a in varint) + ASCII bytes for "40.20" incorrectlyEncodedBytes := append([]byte{0x0a}, []byte("40.20")...) native, _, err := codec.NativeFromBinary(incorrectlyEncodedBytes) @@ -386,12 +389,14 @@ func TestLooksLikeASCIIDecimal(t *testing.T) { {[]byte("40.20"), true}, {[]byte("-40.20"), true}, {[]byte("+40.20"), true}, - {[]byte("0"), true}, - {[]byte("123456"), true}, {[]byte(".5"), true}, {[]byte("5."), true}, - {[]byte(""), false}, - {[]byte("-"), false}, + {[]byte("0.0"), true}, + {[]byte("0"), false}, // no decimal point - could be valid two's complement + {[]byte("123456"), false}, // no decimal point - could be valid two's complement + {[]byte(""), false}, // empty + {[]byte("-"), false}, // no digit + {[]byte("."), false}, // no digit {[]byte("40.20.30"), false}, // multiple dots {[]byte("40-20"), false}, // sign not at start {[]byte("40a20"), false}, // non-decimal char @@ -409,10 +414,46 @@ func TestLooksLikeASCIIDecimal(t *testing.T) { } } +func TestDecimalDefaultNoBackwardsCompat(t *testing.T) { + // Test that by default, ASCII-looking bytes are NOT interpreted as ASCII + // but rather as two's-complement (which is the correct spec behavior) + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + + // Create codec with default options (no backwards compat) + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + // "09" as ASCII bytes is [0x30, 0x39] which is 12345 in decimal + // This could be misinterpreted as the string "09" if backwards compat were on + // But with default settings, it should be decoded as 12345/100 = 123.45 + asciiLookingBytes := []byte{0x04, 0x30, 0x39} // length=2, bytes="09" + + native, _, err := codec.NativeFromBinary(asciiLookingBytes) + if err != nil { + t.Fatalf("NativeFromBinary: %v", err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) + } + + // 0x3039 = 12345, with scale 2 = 123.45 = 12345/100 + expected := big.NewRat(12345, 100) + if rat.Cmp(expected) != 0 { + t.Errorf("NativeFromBinary (default, no backwards compat): got %v, want %v", rat, expected) + } +} + func TestDecimalNegativeBackwardsCompatibility(t *testing.T) { // Test backwards compatibility with negative numbers encoded as ASCII schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - codec, err := NewCodec(schema) + + // Create codec with backwards compatibility enabled + opt := &CodecOption{EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: true} + codec, err := NewCodecWithOptions(schema, opt) if err != nil { t.Fatal(err) } From 6a6efc37d827e4a7365da47e0cdf550d84b67e57 Mon Sep 17 00:00:00 2001 From: Patrick Assuied Date: Sun, 4 Jan 2026 10:33:44 -0800 Subject: [PATCH 4/5] Change implementation to be fully backwards compatible and add option to be Spec compliant --- codec.go | 19 +++--- logical_type.go | 92 ++++++++++--------------- logical_type_test.go | 157 ++++++------------------------------------- 3 files changed, 63 insertions(+), 205 deletions(-) diff --git a/codec.go b/codec.go index f8a21e0..28440fc 100644 --- a/codec.go +++ b/codec.go @@ -50,14 +50,13 @@ type CodecOption struct { // Primarily used to handle edge cases where some Avro implementations allow string representations of null. EnableStringNull bool - // EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding controls backwards compatibility for decimal - // binary decoding. When true, the decoder will first check if binary bytes look like an - // ASCII decimal string (e.g., "40.20") and parse them as such. This handles legacy data - // that was incorrectly encoded as ASCII strings instead of two's-complement bytes. - // WARNING: This can cause incorrect decoding if a valid two's-complement value happens - // to look like ASCII digits. Only enable this if you have legacy incorrectly-encoded data. - // Default: false (use correct two's-complement decoding only) - EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding bool + // EnableDecimalBinarySpecCompliantEncoding controls whether decimal values use + // Avro 1.10.2 spec-compliant encoding. When true: + // - Binary encoding uses two's-complement representation of the unscaled integer + // - JSON textual encoding uses human-readable decimal strings like "40.20" + // When false (default), legacy encoding is used for backwards compatibility. + // Default: false (legacy encoding for backwards compatibility) + EnableDecimalBinarySpecCompliantEncoding bool } // Codec supports decoding binary and text Avro data to Go native data types, @@ -91,8 +90,8 @@ type codecBuilder struct { // DefaultCodecOption returns a CodecOption with recommended default settings. func DefaultCodecOption() *CodecOption { return &CodecOption{ - EnableStringNull: true, - EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: false, + EnableStringNull: true, + EnableDecimalBinarySpecCompliantEncoding: false, } } diff --git a/logical_type.go b/logical_type.go index 3ed10e2..4311b1c 100644 --- a/logical_type.go +++ b/logical_type.go @@ -275,20 +275,27 @@ func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, sche decimalSearchType := fmt.Sprintf("bytes.decimal.%d.%d", precision, scale) st[decimalSearchType] = c - // Check if backwards compatibility mode is enabled - enableBackwardsCompat := cb != nil && cb.option != nil && cb.option.EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding - - c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale) - c.textualFromNative = decimalTextualFromNative(scale) - c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale, enableBackwardsCompat) - c.nativeFromTextual = nativeFromDecimalTextual() + // Check if spec-compliant encoding is enabled + specCompliant := cb != nil && cb.option != nil && cb.option.EnableDecimalBinarySpecCompliantEncoding + + if specCompliant { + // Spec-compliant encoding: two's complement binary, human-readable textual + c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale) + c.textualFromNative = decimalTextualFromNative(scale) + c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalTextual() + } else { + // Legacy encoding (default): for backwards compatibility + c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale) + c.textualFromNative = decimalBytesFromNative(bytesTextualFromNative, toSignedBytes, precision, scale) + c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalBytes(bytesNativeFromTextual, scale) + } return c, nil } -// nativeFromDecimalBytes decodes bytes to *big.Rat. -// If enableBackwardsCompat is true, it first checks if bytes look like an ASCII decimal -// string (for backwards compatibility with incorrectly encoded legacy data). -func nativeFromDecimalBytes(fn toNativeFn, scale int, enableBackwardsCompat bool) toNativeFn { +// nativeFromDecimalBytes decodes bytes to *big.Rat using two's-complement representation. +func nativeFromDecimalBytes(fn toNativeFn, scale int) toNativeFn { return func(bytes []byte) (interface{}, []byte, error) { d, b, err := fn(bytes) if err != nil { @@ -299,15 +306,7 @@ func nativeFromDecimalBytes(fn toNativeFn, scale int, enableBackwardsCompat bool return nil, bytes, fmt.Errorf("cannot transform to native decimal, expected []byte, received %T", d) } - // Check if bytes look like ASCII decimal string (backwards compat) - if enableBackwardsCompat && looksLikeASCIIDecimal(bs) { - r := new(big.Rat) - if _, ok := r.SetString(string(bs)); ok { - return r, b, nil - } - } - - // Normal two's-complement decoding + // Two's-complement decoding num := big.NewInt(0) fromSignedBytes(num, bs) denom := new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(scale)), nil) @@ -364,38 +363,6 @@ func nativeFromDecimalTextual() toNativeFn { } } -// looksLikeASCIIDecimal checks if the bytes look like an ASCII decimal string -// (for backwards compatibility with incorrectly encoded data). -// Returns true if all bytes are printable ASCII, form a valid decimal pattern, -// and contain a decimal point. Requiring a decimal point reduces false positives -// since 0x2E ('.') is unlikely to appear in valid two's-complement encoded numbers. -func looksLikeASCIIDecimal(bs []byte) bool { - if len(bs) == 0 { - return false - } - hasDigit := false - hasDot := false - for i, b := range bs { - switch { - case b >= '0' && b <= '9': - hasDigit = true - case b == '.': - if hasDot { - return false // multiple dots - } - hasDot = true - case b == '-' || b == '+': - if i != 0 { - return false // sign not at start - } - default: - return false // non-decimal character - } - } - // Require both a digit and a decimal point to reduce false positives - return hasDigit && hasDot -} - func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}, cb *codecBuilder) (*Codec, error) { precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap) if err != nil { @@ -413,13 +380,22 @@ func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, sche return nil, err } - // Check if backwards compatibility mode is enabled - enableBackwardsCompat := cb != nil && cb.option != nil && cb.option.EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding + // Check if spec-compliant encoding is enabled + specCompliant := cb != nil && cb.option != nil && cb.option.EnableDecimalBinarySpecCompliantEncoding - c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale) - c.textualFromNative = decimalTextualFromNative(scale) - c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale, enableBackwardsCompat) - c.nativeFromTextual = nativeFromDecimalTextual() + if specCompliant { + // Spec-compliant encoding: two's complement binary, human-readable textual + c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale) + c.textualFromNative = decimalTextualFromNative(scale) + c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalTextual() + } else { + // Legacy encoding (default): for backwards compatibility + c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale) + c.textualFromNative = decimalBytesFromNative(c.textualFromNative, toSignedFixedBytes(size), precision, scale) + c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, scale) + c.nativeFromTextual = nativeFromDecimalBytes(c.nativeFromTextual, scale) + } return c, nil } diff --git a/logical_type_test.go b/logical_type_test.go index 6a0dad9..60f16a5 100644 --- a/logical_type_test.go +++ b/logical_type_test.go @@ -183,9 +183,13 @@ func TestDecimalBytesLogicalTypeInRecordDecodeWithDefault(t *testing.T) { testBinaryCodecPass(t, schema, map[string]interface{}{"mydecimal": big.NewRat(617, 50)}, []byte("\x04\x04\xd2")) } -func TestDecimalBytesTextualRoundTrip(t *testing.T) { +func TestDecimalBytesSpecCompliantTextualRoundTrip(t *testing.T) { + // Test spec-compliant textual encoding with human-readable decimal strings schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - codec, err := NewCodec(schema) + + // Create codec with spec-compliant encoding enabled + opt := &CodecOption{EnableDecimalBinarySpecCompliantEncoding: true} + codec, err := NewCodecWithOptions(schema, opt) if err != nil { t.Fatal(err) } @@ -229,9 +233,13 @@ func TestDecimalBytesTextualRoundTrip(t *testing.T) { } } -func TestDecimalFixedTextualRoundTrip(t *testing.T) { +func TestDecimalFixedSpecCompliantTextualRoundTrip(t *testing.T) { + // Test spec-compliant textual encoding with human-readable decimal strings schema := `{"type": "fixed", "size": 12, "logicalType": "decimal", "precision": 4, "scale": 2}` - codec, err := NewCodec(schema) + + // Create codec with spec-compliant encoding enabled + opt := &CodecOption{EnableDecimalBinarySpecCompliantEncoding: true} + codec, err := NewCodecWithOptions(schema, opt) if err != nil { t.Fatal(err) } @@ -274,40 +282,8 @@ func TestDecimalFixedTextualRoundTrip(t *testing.T) { } } -func TestDecimalBytesBackwardsCompatibility(t *testing.T) { - // Test that binary data incorrectly encoded as ASCII decimal strings - // can still be decoded correctly when backwards compatibility is enabled - schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - - // Create codec with backwards compatibility enabled - opt := &CodecOption{EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: true} - codec, err := NewCodecWithOptions(schema, opt) - if err != nil { - t.Fatal(err) - } - - // Simulate incorrectly encoded data: "40.20" as ASCII bytes - // Length prefix (10 = 0x0a in varint) + ASCII bytes for "40.20" - incorrectlyEncodedBytes := append([]byte{0x0a}, []byte("40.20")...) - - native, _, err := codec.NativeFromBinary(incorrectlyEncodedBytes) - if err != nil { - t.Fatalf("NativeFromBinary (backwards compat): %v", err) - } - - rat, ok := native.(*big.Rat) - if !ok { - t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) - } - - expected := big.NewRat(4020, 100) - if rat.Cmp(expected) != 0 { - t.Errorf("NativeFromBinary (backwards compat): got %v, want %v", rat, expected) - } -} - func TestDecimalBytesCorrectBinaryEncoding(t *testing.T) { - // Test that correctly encoded binary data (two's complement) still works + // Test that binary encoding uses two's complement (same for both legacy and spec-compliant) schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` codec, err := NewCodec(schema) if err != nil { @@ -335,10 +311,13 @@ func TestDecimalBytesCorrectBinaryEncoding(t *testing.T) { } } -func TestDecimalTextualToBinaryRoundTrip(t *testing.T) { - // Test the full flow: textual -> native -> binary -> native -> textual +func TestDecimalSpecCompliantTextualToBinaryRoundTrip(t *testing.T) { + // Test the full flow with spec-compliant encoding: textual -> native -> binary -> native -> textual schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - codec, err := NewCodec(schema) + + // Create codec with spec-compliant encoding enabled + opt := &CodecOption{EnableDecimalBinarySpecCompliantEncoding: true} + codec, err := NewCodecWithOptions(schema, opt) if err != nil { t.Fatal(err) } @@ -357,7 +336,7 @@ func TestDecimalTextualToBinaryRoundTrip(t *testing.T) { t.Fatalf("BinaryFromNative: %v", err) } - // Verify binary is two's complement, not ASCII string + // Verify binary is two's complement // 4020 = 0x0FB4 in hex expectedBinary := []byte{0x04, 0x0f, 0xb4} if string(binary) != string(expectedBinary) { @@ -381,102 +360,6 @@ func TestDecimalTextualToBinaryRoundTrip(t *testing.T) { } } -func TestLooksLikeASCIIDecimal(t *testing.T) { - testCases := []struct { - input []byte - expected bool - }{ - {[]byte("40.20"), true}, - {[]byte("-40.20"), true}, - {[]byte("+40.20"), true}, - {[]byte(".5"), true}, - {[]byte("5."), true}, - {[]byte("0.0"), true}, - {[]byte("0"), false}, // no decimal point - could be valid two's complement - {[]byte("123456"), false}, // no decimal point - could be valid two's complement - {[]byte(""), false}, // empty - {[]byte("-"), false}, // no digit - {[]byte("."), false}, // no digit - {[]byte("40.20.30"), false}, // multiple dots - {[]byte("40-20"), false}, // sign not at start - {[]byte("40a20"), false}, // non-decimal char - {[]byte("\x0f\xb4"), false}, // binary data (two's complement) - {[]byte{0x00}, false}, // null byte - {[]byte{0xff, 0xff}, false}, // high bytes (negative two's complement) - {[]byte("12.34e5"), false}, // scientific notation not supported - } - - for _, tc := range testCases { - result := looksLikeASCIIDecimal(tc.input) - if result != tc.expected { - t.Errorf("looksLikeASCIIDecimal(%q): got %v, want %v", tc.input, result, tc.expected) - } - } -} - -func TestDecimalDefaultNoBackwardsCompat(t *testing.T) { - // Test that by default, ASCII-looking bytes are NOT interpreted as ASCII - // but rather as two's-complement (which is the correct spec behavior) - schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - - // Create codec with default options (no backwards compat) - codec, err := NewCodec(schema) - if err != nil { - t.Fatal(err) - } - - // "09" as ASCII bytes is [0x30, 0x39] which is 12345 in decimal - // This could be misinterpreted as the string "09" if backwards compat were on - // But with default settings, it should be decoded as 12345/100 = 123.45 - asciiLookingBytes := []byte{0x04, 0x30, 0x39} // length=2, bytes="09" - - native, _, err := codec.NativeFromBinary(asciiLookingBytes) - if err != nil { - t.Fatalf("NativeFromBinary: %v", err) - } - - rat, ok := native.(*big.Rat) - if !ok { - t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) - } - - // 0x3039 = 12345, with scale 2 = 123.45 = 12345/100 - expected := big.NewRat(12345, 100) - if rat.Cmp(expected) != 0 { - t.Errorf("NativeFromBinary (default, no backwards compat): got %v, want %v", rat, expected) - } -} - -func TestDecimalNegativeBackwardsCompatibility(t *testing.T) { - // Test backwards compatibility with negative numbers encoded as ASCII - schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` - - // Create codec with backwards compatibility enabled - opt := &CodecOption{EnableDecimalBinaryToTextualBackwardsCompatASCIIDecoding: true} - codec, err := NewCodecWithOptions(schema, opt) - if err != nil { - t.Fatal(err) - } - - // Simulate incorrectly encoded data: "-40.20" as ASCII bytes - incorrectlyEncodedBytes := append([]byte{0x0c}, []byte("-40.20")...) - - native, _, err := codec.NativeFromBinary(incorrectlyEncodedBytes) - if err != nil { - t.Fatalf("NativeFromBinary (backwards compat): %v", err) - } - - rat, ok := native.(*big.Rat) - if !ok { - t.Fatalf("NativeFromBinary: expected *big.Rat, got %T", native) - } - - expected := big.NewRat(-4020, 100) - if rat.Cmp(expected) != 0 { - t.Errorf("NativeFromBinary (backwards compat): got %v, want %v", rat, expected) - } -} - func TestValidatedStringLogicalTypeInRecordEncode(t *testing.T) { schema := `{ "type": "record", From 38ca4e1162296155c066f84ff33f395816352a24 Mon Sep 17 00:00:00 2001 From: Patrick Assuied Date: Sun, 4 Jan 2026 11:02:25 -0800 Subject: [PATCH 5/5] added tests for legacy --- logical_type_test.go | 92 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/logical_type_test.go b/logical_type_test.go index 60f16a5..7170ccb 100644 --- a/logical_type_test.go +++ b/logical_type_test.go @@ -282,6 +282,98 @@ func TestDecimalFixedSpecCompliantTextualRoundTrip(t *testing.T) { } } +func TestDecimalBytesLegacyTextualRoundTrip(t *testing.T) { + // Test legacy (default) textual encoding with escaped bytes format + schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}` + + // Create codec with default options (legacy encoding) + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + testCases := []struct { + native *big.Rat + expected string // escaped bytes format + }{ + {big.NewRat(4020, 100), `"\u000F\u00B4"`}, // 4020 = 0x0FB4 + {big.NewRat(1234, 100), `"\u0004\u00D2"`}, // 1234 = 0x04D2 + {big.NewRat(-1234, 100), `"\u00FB."`}, // -1234 in two's complement + {big.NewRat(0, 1), `"\u0000"`}, // 0 + {big.NewRat(9999, 100), `"'\u000F"`}, // 9999 = 0x270F + } + + for _, tc := range testCases { + // Encode native to textual + textual, err := codec.TextualFromNative(nil, tc.native) + if err != nil { + t.Fatalf("TextualFromNative(%v): %v", tc.native, err) + } + + if string(textual) != tc.expected { + t.Errorf("TextualFromNative(%v): got %s, want %s", tc.native, textual, tc.expected) + } + + // Decode textual back to native + native, _, err := codec.NativeFromTextual(textual) + if err != nil { + t.Fatalf("NativeFromTextual(%s): %v", textual, err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromTextual(%s): expected *big.Rat, got %T", textual, native) + } + + if rat.Cmp(tc.native) != 0 { + t.Errorf("NativeFromTextual(%s): got %v, want %v", textual, rat, tc.native) + } + } +} + +func TestDecimalFixedLegacyTextualRoundTrip(t *testing.T) { + // Test legacy (default) textual encoding with escaped bytes format for fixed type + schema := `{"type": "fixed", "size": 12, "logicalType": "decimal", "precision": 4, "scale": 2}` + + // Create codec with default options (legacy encoding) + codec, err := NewCodec(schema) + if err != nil { + t.Fatal(err) + } + + testCases := []struct { + native *big.Rat + }{ + {big.NewRat(4020, 100)}, + {big.NewRat(1234, 100)}, + {big.NewRat(-1234, 100)}, + // Note: 0 is not tested here due to fixed size constraints + } + + for _, tc := range testCases { + // Encode native to textual + textual, err := codec.TextualFromNative(nil, tc.native) + if err != nil { + t.Fatalf("TextualFromNative(%v): %v", tc.native, err) + } + + // Decode textual back to native - should round-trip correctly + native, _, err := codec.NativeFromTextual(textual) + if err != nil { + t.Fatalf("NativeFromTextual(%s): %v", textual, err) + } + + rat, ok := native.(*big.Rat) + if !ok { + t.Fatalf("NativeFromTextual(%s): expected *big.Rat, got %T", textual, native) + } + + if rat.Cmp(tc.native) != 0 { + t.Errorf("Round-trip failed for %v: got %v", tc.native, rat) + } + } +} + func TestDecimalBytesCorrectBinaryEncoding(t *testing.T) { // Test that binary encoding uses two's complement (same for both legacy and spec-compliant) schema := `{"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}`