From 632cadcd15132fde1d2db6a6494e5f07c07edab5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 04:36:17 +0000 Subject: [PATCH] Handle Unicode minus (U+2212) as comment-like syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClickHouse doesn't recognize the Unicode minus sign (−, U+2212) as a mathematical operator. Instead of parsing it as a minus, treat everything from the Unicode minus to the end of line or semicolon as a comment. This matches ClickHouse's behavior where SELECT 1 − 2 produces just Literal UInt64_1 in the EXPLAIN AST output. --- lexer/lexer.go | 24 ++++++++++++++++--- .../02869_unicode_minus/metadata.json | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lexer/lexer.go b/lexer/lexer.go index 1159408bf..8f72a46ec 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -98,6 +98,11 @@ func (l *Lexer) NextToken() Item { if l.ch == '/' && l.peekChar() == '*' { return l.readBlockComment() } + // Unicode minus (U+2212) is treated as starting a line comment + // ClickHouse doesn't recognize it as an operator + if l.ch == '\u2212' { + return l.readUnicodeMinusComment() + } switch l.ch { case '+': @@ -227,9 +232,6 @@ func (l *Lexer) NextToken() Item { return l.readQuotedIdentifier() case '\u201C', '\u201D': // Unicode curly double quotes " " return l.readUnicodeQuotedIdentifier(l.ch) - case '\u2212': // Unicode minus sign − - l.readChar() - return Item{Token: token.MINUS, Value: "−", Pos: pos} case '`': return l.readBacktickIdentifier() case '@': @@ -297,6 +299,22 @@ func (l *Lexer) readHashComment() Item { return Item{Token: token.COMMENT, Value: sb.String(), Pos: pos} } +// readUnicodeMinusComment reads from a unicode minus (U+2212) to the end of line or semicolon. +// ClickHouse doesn't recognize unicode minus as an operator, so we treat it as a comment. 
+func (l *Lexer) readUnicodeMinusComment() Item { + pos := l.pos + var sb strings.Builder + // Keep the leading − in the comment text, then advance past it + sb.WriteRune(l.ch) + l.readChar() + + for l.ch != '\n' && l.ch != ';' && l.ch != 0 && !l.eof { + sb.WriteRune(l.ch) + l.readChar() + } + return Item{Token: token.COMMENT, Value: sb.String(), Pos: pos} +} + func (l *Lexer) readBlockComment() Item { pos := l.pos var sb strings.Builder diff --git a/parser/testdata/02869_unicode_minus/metadata.json b/parser/testdata/02869_unicode_minus/metadata.json index ef120d978..0967ef424 100644 --- a/parser/testdata/02869_unicode_minus/metadata.json +++ b/parser/testdata/02869_unicode_minus/metadata.json @@ -1 +1 @@ -{"todo": true} +{}