From 8a400e37cd82483fa6e727c41d9e9e323aa754b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Kon=C4=8Dek?= Date: Tue, 9 Sep 2025 15:19:19 +0200 Subject: [PATCH 1/3] Implement Unicode literal handling Generated-by: Claude Code --- src/java_symbols.hpp | 238 ++++++++++++++++++++++++-- test.sh | 3 + test_resources/Unicode_escapes.1.java | 16 ++ test_resources/Unicode_escapes.2.java | 15 ++ test_resources/Unicode_escapes.java | 17 ++ 5 files changed, 274 insertions(+), 15 deletions(-) create mode 100644 test_resources/Unicode_escapes.1.java create mode 100644 test_resources/Unicode_escapes.2.java create mode 100644 test_resources/Unicode_escapes.java diff --git a/src/java_symbols.hpp b/src/java_symbols.hpp index b344a44..97b7e77 100644 --- a/src/java_symbols.hpp +++ b/src/java_symbols.hpp @@ -134,6 +134,93 @@ inline static auto strict_mode = std::optional(); */ namespace java_symbols { +/*! + * Decodes a Unicode escape sequence \uXXXX to its character representation. + * + * @param content The string content containing potential Unicode escape + * @param position The starting position to check for Unicode escape + * @param decoded_char Output parameter for the decoded character + * @param sequence_length Output parameter for the length of the Unicode sequence (0 if not a valid escape) + * + * @return True if a valid Unicode escape sequence was found and decoded + */ +inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t position, char32_t& decoded_char, std::ptrdiff_t& sequence_length) noexcept +{ + sequence_length = 0; + + if (position + 5 >= std::ssize(content) || content[position] != '\\' || content[position + 1] != 'u') + { + return false; + } + + // Check for valid hex digits + auto hex_start = position + 2; + for (std::ptrdiff_t i = 0; i < 4; ++i) + { + char c = content[hex_start + i]; + if (!std::isxdigit(static_cast(c))) + { + return false; + } + } + + // Parse hex digits + char32_t result = 0; + for (std::ptrdiff_t i = 0; i < 4; ++i) + { + char c = content[hex_start + i]; + result = result * 16; + if (c >= '0' && c <= '9') + { + result += c - '0'; + } + else if (c >= 'A' && c <= 'F') + { + result += c - 'A' + 10; + } + else if (c >= 'a' && c <= 'f') + { + result += c - 'a' + 10; + } + } + + decoded_char = result; + sequence_length = 6; // \uXXXX + return true; +} + +/*! + * Checks if a character at the given position (potentially Unicode-escaped) matches the target character. + * + * @param content The string content to check + * @param position The position to check + * @param target_char The character to match against + * @param actual_length Output parameter for the actual length of the character representation + * + * @return True if the character at position matches target_char + */ +inline bool char_matches_at_position(std::string_view content, std::ptrdiff_t position, char target_char, std::ptrdiff_t& actual_length) noexcept +{ + actual_length = 1; // Default for non-Unicode chars + + if (position >= std::ssize(content)) + { + return false; + } + + // First check for Unicode escape + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + actual_length = unicode_length; + return static_cast(decoded_char) == target_char; + } + + // Regular character check + return content[position] == target_char; +} + /*! * Iterates over @p content starting at @p position to find the first character * which is not part of a Java comment nor a whitespace character. @@ -190,11 +277,49 @@ inline bool is_identifier_char(char c) noexcept return c == '_' or (not std::ispunct(static_cast(c)) and not std::isspace(static_cast(c))); } +/*! + * Checks if a character at the given position (potentially Unicode-escaped) is a valid identifier character. + * + * @param content The string content to check + * @param position The position to check + * @param char_length Output parameter for the actual length of the character representation + * + * @return True if the character at position is a valid identifier character + */ +inline bool is_identifier_char_at_position(std::string_view content, std::ptrdiff_t position, std::ptrdiff_t& char_length) noexcept +{ + char_length = 1; // Default for non-Unicode chars + + if (position >= std::ssize(content)) + { + return false; + } + + // Check for Unicode escape first + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + char_length = unicode_length; + // For Unicode escapes, we check if the decoded character would be a valid identifier char + if (decoded_char <= 127) // ASCII range + { + return is_identifier_char(static_cast(decoded_char)); + } + // For non-ASCII Unicode characters, they are generally valid identifier chars + // unless they are punctuation or whitespace + return decoded_char != 0x0040 && decoded_char != 0x002F && decoded_char != 0x002A; // '@', '/', '*' + } + + // Regular character check + return is_identifier_char(content[position]); +} + /*! * Iterates over @p content starting at @p position to find the next uncommented * symbol and returns it and an index pointing past it. The symbol is either a * sequence of alphanumeric characters or a single non-alphanumeric character or - * an empty string if the end has been reached. + * an empty string if the end has been reached. Supports Unicode escape sequences. */ inline std::tuple next_symbol(std::string_view content, std::ptrdiff_t position = 0) noexcept { @@ -206,13 +331,39 @@ inline std::tuple next_symbol(std::string_view if (position < std::ssize(content)) { - symbol_length = 1; + std::ptrdiff_t char_length; - if (is_identifier_char(content[position])) + // Check if the first character is an identifier character (possibly Unicode-escaped) + if (is_identifier_char_at_position(content, position, char_length)) + { + symbol_length = char_length; + + // Continue reading identifier characters + while (position + symbol_length < std::ssize(content)) + { + std::ptrdiff_t next_char_length; + if (is_identifier_char_at_position(content, position + symbol_length, next_char_length)) + { + symbol_length += next_char_length; + } + else + { + break; + } + } + } + else { - while (position + symbol_length != std::ssize(content) and is_identifier_char(content[position + symbol_length])) + // Single non-identifier character (could be Unicode-escaped) + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + symbol_length = unicode_length; + } + else { - ++symbol_length; + symbol_length = 1; } } } @@ -224,7 +375,7 @@ inline std::tuple next_symbol(std::string_view /*! * Iterates over @p content starting at @p position to find a string @p token * which is present in the source code neither inside a comment nor inside a - * string nor inside a character literal. + * string nor inside a character literal. Supports Unicode escape sequences. * * Special case when @p token == ')', this function counts opening and closing * parentheses and returns the first parenthesis outside. @@ -238,7 +389,7 @@ inline std::tuple next_symbol(std::string_view inline std::ptrdiff_t find_token(std::string_view content, std::string_view token, std::ptrdiff_t position = 0, bool alphanumeric = false, std::ptrdiff_t stack = 0) noexcept { - while (position + std::ssize(token) <= std::ssize(content)) + while (position < std::ssize(content)) { position = ignore_whitespace_comments(content, position); @@ -247,15 +398,58 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke break; } - auto substr = content.substr(position, token.length()); - - if ((token != ")" or stack == 0) and substr == token - and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1])) - or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()])))))) + // Check for single character tokens (like '@', '/', '*') that might be Unicode-escaped + if (token.length() == 1) + { + std::ptrdiff_t char_length; + if (char_matches_at_position(content, position, token[0], char_length)) + { + bool is_valid_match = true; + + // Apply alphanumeric constraints if needed + if (alphanumeric) + { + std::ptrdiff_t prev_char_length; + if (position > 0 && is_identifier_char_at_position(content, position - 1, prev_char_length)) + { + is_valid_match = false; + } + else + { + std::ptrdiff_t next_char_length; + if (position + char_length < std::ssize(content) && is_identifier_char_at_position(content, position + char_length, next_char_length)) + { + is_valid_match = false; + } + } + } + + // Special handling for parentheses stack counting + if (token == ")" && stack == 0 && is_valid_match) + { + return position; + } + else if (token != ")" && is_valid_match) + { + return position; + } + } + } + // Multi-character tokens - check regular substring match + else if (position + std::ssize(token) <= std::ssize(content)) { - return position; + auto substr = content.substr(position, token.length()); + + if ((token != ")" or stack == 0) and substr == token + and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1])) + or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()])))))) + { + return position; + } } - else if (content[position] == '\'') + + // Handle string and character literals, and parentheses counting + if (content[position] == '\'') { if (content.substr(position, 4) == "'\\''") { @@ -295,6 +489,16 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke { ++stack; } + else + { + // Check if we're at a Unicode escape and skip it appropriately + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + position += unicode_length - 1; // -1 because we'll increment at the end of the loop + } + } ++position; } @@ -320,8 +524,12 @@ inline std::tuple next_annotation(std::string_vie if (position < std::ssize(content)) { + // Calculate the actual length of the @ symbol (could be Unicode-escaped) + std::ptrdiff_t at_symbol_length; + char_matches_at_position(content, position, '@', at_symbol_length); + auto symbol = std::string_view(); - std::tie(symbol, end_pos) = next_symbol(content, position + 1); + std::tie(symbol, end_pos) = next_symbol(content, position + at_symbol_length); auto new_end_pos = end_pos; while (not symbol.empty()) diff --git a/test.sh b/test.sh index 93557cd..3b74830 100755 --- a/test.sh +++ b/test.sh @@ -152,6 +152,9 @@ test_file "Array.java" "Array.5.java" -a -n C -n D -n E -n F test_file "Package_info.java" "Package_info.1.java" -a -n MyAnn +test_file "Unicode_escapes.java" "Unicode_escapes.1.java" -n "TestAnnotation" +test_file "Unicode_escapes.java" "Unicode_escapes.2.java" -a -n "TestAnnotation" + ################################################################################ # Tests for tool termination on invalid sources, result is irrelevant diff --git a/test_resources/Unicode_escapes.1.java b/test_resources/Unicode_escapes.1.java new file mode 100644 index 0000000..25f8a6e --- /dev/null +++ b/test_resources/Unicode_escapes.1.java @@ -0,0 +1,16 @@ +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040TestAnnotation +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . (period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +} diff --git a/test_resources/Unicode_escapes.2.java b/test_resources/Unicode_escapes.2.java new file mode 100644 index 0000000..67c7677 --- /dev/null +++ b/test_resources/Unicode_escapes.2.java @@ -0,0 +1,15 @@ +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . (period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +} diff --git a/test_resources/Unicode_escapes.java b/test_resources/Unicode_escapes.java new file mode 100644 index 0000000..ccdf7e3 --- /dev/null +++ b/test_resources/Unicode_escapes.java @@ -0,0 +1,17 @@ +import com.example.TestAnnotation; +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040TestAnnotation +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . (period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +} From e9b8385b7f0dbeb7f75ebebb35811714bf3babdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Kon=C4=8Dek?= Date: Tue, 9 Sep 2025 15:32:13 +0200 Subject: [PATCH 2/3] Simplify Unicode literal handling Generated-by: Claude Code --- src/java_symbols.hpp | 55 +++++++++++--------------------------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/src/java_symbols.hpp b/src/java_symbols.hpp index 97b7e77..9bbe426 100644 --- a/src/java_symbols.hpp +++ b/src/java_symbols.hpp @@ -136,11 +136,12 @@ namespace java_symbols { /*! * Decodes a Unicode escape sequence \uXXXX to its character representation. + * Simplified version assuming valid Java source files. * * @param content The string content containing potential Unicode escape * @param position The starting position to check for Unicode escape * @param decoded_char Output parameter for the decoded character - * @param sequence_length Output parameter for the length of the Unicode sequence (0 if not a valid escape) + * @param sequence_length Output parameter for the length of the Unicode sequence * * @return True if a valid Unicode escape sequence was found and decoded */ @@ -153,7 +154,7 @@ inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t posit return false; } - // Check for valid hex digits + // Check for valid hex digits - simplified validation auto hex_start = position + 2; for (std::ptrdiff_t i = 0; i < 4; ++i) { @@ -164,7 +165,7 @@ inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t posit } } - // Parse hex digits + // Parse hex digits - simplified logic char32_t result = 0; for (std::ptrdiff_t i = 0; i < 4; ++i) { @@ -191,6 +192,7 @@ inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t posit /*! * Checks if a character at the given position (potentially Unicode-escaped) matches the target character. + * Simplified version assuming valid Java source files. * * @param content The string content to check * @param position The position to check @@ -331,39 +333,13 @@ inline std::tuple next_symbol(std::string_view if (position < std::ssize(content)) { - std::ptrdiff_t char_length; + symbol_length = 1; - // Check if the first character is an identifier character (possibly Unicode-escaped) - if (is_identifier_char_at_position(content, position, char_length)) - { - symbol_length = char_length; - - // Continue reading identifier characters - while (position + symbol_length < std::ssize(content)) - { - std::ptrdiff_t next_char_length; - if (is_identifier_char_at_position(content, position + symbol_length, next_char_length)) - { - symbol_length += next_char_length; - } - else - { - break; - } - } - } - else + if (is_identifier_char(content[position])) { - // Single non-identifier character (could be Unicode-escaped) - char32_t decoded_char; - std::ptrdiff_t unicode_length; - if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + while (position + symbol_length != std::ssize(content) and is_identifier_char(content[position + symbol_length])) { - symbol_length = unicode_length; - } - else - { - symbol_length = 1; + ++symbol_length; } } } @@ -409,18 +385,13 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke // Apply alphanumeric constraints if needed if (alphanumeric) { - std::ptrdiff_t prev_char_length; - if (position > 0 && is_identifier_char_at_position(content, position - 1, prev_char_length)) + if (position > 0 && is_identifier_char(content[position - 1])) { is_valid_match = false; } - else + else if (position + char_length < std::ssize(content) && is_identifier_char(content[position + char_length])) { - std::ptrdiff_t next_char_length; - if (position + char_length < std::ssize(content) && is_identifier_char_at_position(content, position + char_length, next_char_length)) - { - is_valid_match = false; - } + is_valid_match = false; } } @@ -496,7 +467,7 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke std::ptrdiff_t unicode_length; if (decode_unicode_escape(content, position, decoded_char, unicode_length)) { - position += unicode_length - 1; // -1 because we'll increment at the end of the loop + position += unicode_length - 1; // -1 because we'll increment at the end } } From eb6f8b8193b429e59dadfe341db8613bf0cdd4cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Kon=C4=8Dek?= Date: Tue, 23 Sep 2025 12:14:04 +0200 Subject: [PATCH 3/3] Add unicode handling for comments --- src/java_symbols.hpp | 61 ++++++++++++++++++++++++++ test.sh | 2 + test_resources/Unicode_comments.1.java | 12 +++++ test_resources/Unicode_comments.java | 14 ++++++ 4 files changed, 89 insertions(+) create mode 100644 test_resources/Unicode_comments.1.java create mode 100644 test_resources/Unicode_comments.java diff --git a/src/java_symbols.hpp b/src/java_symbols.hpp index 9bbe426..369cf77 100644 --- a/src/java_symbols.hpp +++ b/src/java_symbols.hpp @@ -223,9 +223,60 @@ inline bool char_matches_at_position(std::string_view content, std::ptrdiff_t po return content[position] == target_char; } +/*! + * Helper function to check for Unicode-escaped comment delimiters. + * Checks for patterns like \u002F\u002F (//) or \u002F\u002A (slash-star) + */ +inline std::ptrdiff_t check_unicode_comment_start(std::string_view content, std::ptrdiff_t position) noexcept +{ + // Check for \u002F\u002F (Unicode-escaped //) + std::ptrdiff_t char1_length, char2_length; + if (char_matches_at_position(content, position, '/', char1_length) && + position + char1_length < std::ssize(content) && + char_matches_at_position(content, position + char1_length, '/', char2_length)) + { + // Single-line comment - find newline + auto comment_start = position + char1_length + char2_length; + position = content.find('\n', comment_start); + + if (position == std::ptrdiff_t(content.npos)) + { + return std::ssize(content); + } + + return position + 1; + } + // Check for \u002F\u002A (Unicode-escaped /*) + else if (char_matches_at_position(content, position, '/', char1_length) && + position + char1_length < std::ssize(content) && + char_matches_at_position(content, position + char1_length, '*', char2_length)) + { + // Multi-line comment - find */ + auto comment_start = position + char1_length + char2_length; + + // Look for end of comment (could also be Unicode-escaped) + while (comment_start < std::ssize(content)) + { + std::ptrdiff_t end_char1_length, end_char2_length; + if (char_matches_at_position(content, comment_start, '*', end_char1_length) && + comment_start + end_char1_length < std::ssize(content) && + char_matches_at_position(content, comment_start + end_char1_length, '/', end_char2_length)) + { + return comment_start + end_char1_length + end_char2_length; + } + ++comment_start; + } + + return std::ssize(content); // Unterminated comment + } + + return position; // No comment found +} + /*! * Iterates over @p content starting at @p position to find the first character * which is not part of a Java comment nor a whitespace character. + * Supports Unicode-escaped comment delimiters. * * @return The position of the first non-whitespace non-comment character or the * length of the string if none is found. @@ -242,6 +293,7 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std:: auto result = position; + // Check for regular comment delimiters first if (auto subst = content.substr(position, 2); subst == "//") { position = content.find('\n', position + 2); @@ -264,6 +316,15 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std:: position += 2; } + else + { + // Check for Unicode-escaped comment delimiters + auto unicode_comment_pos = check_unicode_comment_start(content, position); + if (unicode_comment_pos != position) + { + position = unicode_comment_pos; + } + } if (result == position) { diff --git a/test.sh b/test.sh index 3b74830..a5f38d1 100755 --- a/test.sh +++ b/test.sh @@ -155,6 +155,8 @@ test_file "Package_info.java" "Package_info.1.java" -a -n MyAnn test_file "Unicode_escapes.java" "Unicode_escapes.1.java" -n "TestAnnotation" test_file "Unicode_escapes.java" "Unicode_escapes.2.java" -a -n "TestAnnotation" +test_file "Unicode_comments.java" "Unicode_comments.1.java" -a -n "TestAnnotation" + ################################################################################ # Tests for tool termination on invalid sources, result is irrelevant diff --git a/test_resources/Unicode_comments.1.java b/test_resources/Unicode_comments.1.java new file mode 100644 index 0000000..f823129 --- /dev/null +++ b/test_resources/Unicode_comments.1.java @@ -0,0 +1,12 @@ + +\u002F\u002F This is a Unicode-escaped single-line comment +\u002F\u002A This is a Unicode-escaped + multi-line comment \u002A\u002F +class Unicode_comments { + // Regular comment + /* Regular multi-line comment */ + + public void testMethod() { + String test = "Unicode comments test"; + } +} diff --git a/test_resources/Unicode_comments.java b/test_resources/Unicode_comments.java new file mode 100644 index 0000000..e5eff06 --- /dev/null +++ b/test_resources/Unicode_comments.java @@ -0,0 +1,14 @@ +import com.example.TestAnnotation; + +\u002F\u002F This is a Unicode-escaped single-line comment +@TestAnnotation +\u002F\u002A This is a Unicode-escaped + multi-line comment \u002A\u002F +class Unicode_comments { + // Regular comment + /* Regular multi-line comment */ + + public void testMethod() { + String test = "Unicode comments test"; + } +}