From ffec829c2c6ae42aca5922f034f415b9b6f10c73 Mon Sep 17 00:00:00 2001 From: Vinayak Mishra Date: Tue, 17 Feb 2026 11:49:26 +0545 Subject: [PATCH] fix: replace invalid codepoints with U+FFFD in entity parser entity() emits a literal null byte for &#0; instead of the Unicode replacement character. The CommonMark spec (section 6.2) requires U+FFFD for codepoint 0, surrogates (0xD800-0xDFFF), and values above 0x10FFFF. Check for all three invalid codepoint categories before converting to a rune. --- inline_test.go | 8 ++++++++ parser/inline.go | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/inline_test.go b/inline_test.go index f045201c..b5e20b69 100644 --- a/inline_test.go +++ b/inline_test.go @@ -1325,3 +1325,11 @@ func BenchmarkSmartDoubleQuotes(b *testing.B) { runMarkdown("this should be normal \"quoted\" text.\n", params) } } + +func TestEntityNullByte(t *testing.T) { + // &#0; should produce U+FFFD per CommonMark spec section 6.2 + doTestsInlineParam(t, []string{ + "&#0;", + "<p>\uFFFD</p>\n", + }, TestParams{}) +} diff --git a/parser/inline.go b/parser/inline.go index d526ce22..39c1d5a1 100644 --- a/parser/inline.go +++ b/parser/inline.go @@ -817,7 +817,12 @@ func entity(p *Parser, data []byte, offset int) (int, ast.Node) { codepoint, err = strconv.ParseUint(string(ent[2:len(ent)-1]), 10, 64) } if err == nil { // only if conversion was valid return here. - return end, newTextNode([]byte(string(rune(codepoint)))) + r := rune(codepoint) + // Replace invalid codepoints with U+FFFD per CommonMark spec section 6.2 + if r == 0 || (r >= 0xD800 && r <= 0xDFFF) || r > 0x10FFFF { + r = '\uFFFD' + } + return end, newTextNode([]byte(string(r))) } return end, newTextNode(ent)