/*!
 * Decodes a Java Unicode escape sequence starting at @p position.
 *
 * A Unicode escape is a backslash, one or more 'u' markers and exactly four
 * hexadecimal digits. The backslash only starts an escape when it is preceded
 * by an even number of contiguous backslashes, so the second backslash of an
 * escaped backslash never starts one.
 *
 * @param content The string content containing a potential Unicode escape
 * @param position The position at which the escape is expected to start
 * @param decoded_char Output parameter receiving the decoded UTF-16 code unit;
 *        it is left untouched when no escape is found
 * @param sequence_length Output parameter receiving the number of characters
 *        occupied by the escape (6 for the common single-'u' form, more when
 *        several 'u' markers are used) or 0 when no escape is found
 *
 * @return True if a valid Unicode escape sequence was found and decoded
 */
inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t position, char32_t& decoded_char,
                                  std::ptrdiff_t& sequence_length) noexcept
{
    sequence_length = 0;

    auto const length = static_cast<std::ptrdiff_t>(content.size());

    if (position < 0 or position >= length or content[position] != '\\')
    {
        return false;
    }

    // A backslash preceded by an odd number of backslashes is itself escaped
    // and therefore not eligible to begin a Unicode escape.
    std::ptrdiff_t preceding_backslashes = 0;

    while (preceding_backslashes < position and content[position - 1 - preceding_backslashes] == '\\')
    {
        ++preceding_backslashes;
    }

    if (preceding_backslashes % 2 != 0)
    {
        return false;
    }

    // Skip the 'u' markers; at least one is mandatory, any number is allowed.
    auto cursor = position + 1;

    while (cursor < length and content[cursor] == 'u')
    {
        ++cursor;
    }

    if (cursor == position + 1 or length - cursor < 4)
    {
        return false;
    }

    // Validate and accumulate the four hex digits in a single pass. The ranges
    // are checked by hand to stay independent of the current C locale.
    char32_t value = 0;

    for (std::ptrdiff_t i = 0; i < 4; ++i)
    {
        char const c = content[cursor + i];
        char32_t digit = 0;

        if (c >= '0' and c <= '9')
        {
            digit = static_cast<char32_t>(c - '0');
        }
        else if (c >= 'A' and c <= 'F')
        {
            digit = static_cast<char32_t>(c - 'A' + 10);
        }
        else if (c >= 'a' and c <= 'f')
        {
            digit = static_cast<char32_t>(c - 'a' + 10);
        }
        else
        {
            return false;
        }

        value = value * 16 + digit;
    }

    decoded_char = value;
    sequence_length = cursor + 4 - position;
    return true;
}

/*!
 * Checks if the character at the given position (potentially written as a
 * Unicode escape) matches the target character.
 *
 * @param content The string content to check
 * @param position The position to check
 * @param target_char The character to match against
 * @param actual_length Output parameter receiving the number of characters the
 *        inspected character occupies in @p content: the escape length when a
 *        Unicode escape starts at @p position (even if it does not match),
 *        1 otherwise
 *
 * @return True if the character at position matches target_char
 */
inline bool char_matches_at_position(std::string_view content, std::ptrdiff_t position, char target_char,
                                     std::ptrdiff_t& actual_length) noexcept
{
    actual_length = 1; // Default for characters which are not escaped

    if (position < 0 or position >= static_cast<std::ptrdiff_t>(content.size()))
    {
        return false;
    }

    char32_t decoded_char = 0;
    std::ptrdiff_t unicode_length = 0;

    if (decode_unicode_escape(content, position, decoded_char, unicode_length))
    {
        actual_length = unicode_length;

        // Compare in the wide domain: narrowing the decoded value to char
        // would make e.g. U+0140 compare equal to '@' (0x40).
        return decoded_char == static_cast<char32_t>(static_cast<unsigned char>(target_char));
    }

    return content[position] == target_char;
}

/*!
 * Skips a Java comment starting at @p position whose delimiters may be written
 * as Unicode escapes, e.g. an escaped double slash or an escaped slash-star.
 * Plain and escaped delimiter characters may be mixed.
 *
 * A single-line comment ends after the next '\n' character, a multi-line
 * comment ends after its closing star-slash delimiter (plain or escaped).
 *
 * @param content The string content to check
 * @param position The position at which a comment may start
 *
 * @return The position directly after the comment, the length of @p content if
 *         the comment is not terminated, or @p position unchanged if no comment
 *         starts there.
 */
inline std::ptrdiff_t check_unicode_comment_start(std::string_view content, std::ptrdiff_t position) noexcept
{
    auto const length = static_cast<std::ptrdiff_t>(content.size());

    // Both comment kinds start with a slash, so match it only once.
    std::ptrdiff_t slash_length = 0;

    if (not char_matches_at_position(content, position, '/', slash_length))
    {
        return position;
    }

    auto const second = position + slash_length;
    std::ptrdiff_t second_length = 0;

    if (char_matches_at_position(content, second, '/', second_length))
    {
        // Single-line comment: continue behind the terminating newline
        auto const newline = content.find('\n', static_cast<std::size_t>(second + second_length));

        return newline == std::string_view::npos ? length : static_cast<std::ptrdiff_t>(newline) + 1;
    }

    if (char_matches_at_position(content, second, '*', second_length))
    {
        // Multi-line comment: look for the closing delimiter, which could be
        // Unicode-escaped as well
        for (auto cursor = second + second_length; cursor < length; ++cursor)
        {
            std::ptrdiff_t star_length = 0;
            std::ptrdiff_t closing_length = 0;

            if (char_matches_at_position(content, cursor, '*', star_length)
                and char_matches_at_position(content, cursor + star_length, '/', closing_length))
            {
                return cursor + star_length + closing_length;
            }
        }

        return length; // Unterminated comment
    }

    return position; // No comment found
}
* * @return The position of the first non-whitespace non-comment character or the * length of the string if none is found. @@ -153,6 +293,7 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std:: auto result = position; + // Check for regular comment delimiters first if (auto subst = content.substr(position, 2); subst == "//") { position = content.find('\n', position + 2); @@ -175,6 +316,15 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std:: position += 2; } + else + { + // Check for Unicode-escaped comment delimiters + auto unicode_comment_pos = check_unicode_comment_start(content, position); + if (unicode_comment_pos != position) + { + position = unicode_comment_pos; + } + } if (result == position) { @@ -190,11 +340,49 @@ inline bool is_identifier_char(char c) noexcept return c == '_' or (not std::ispunct(static_cast(c)) and not std::isspace(static_cast(c))); } +/*! + * Checks if a character at the given position (potentially Unicode-escaped) is a valid identifier character. 
+ * + * @param content The string content to check + * @param position The position to check + * @param char_length Output parameter for the actual length of the character representation + * + * @return True if the character at position is a valid identifier character + */ +inline bool is_identifier_char_at_position(std::string_view content, std::ptrdiff_t position, std::ptrdiff_t& char_length) noexcept +{ + char_length = 1; // Default for non-Unicode chars + + if (position >= std::ssize(content)) + { + return false; + } + + // Check for Unicode escape first + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + char_length = unicode_length; + // For Unicode escapes, we check if the decoded character would be a valid identifier char + if (decoded_char <= 127) // ASCII range + { + return is_identifier_char(static_cast(decoded_char)); + } + // For non-ASCII Unicode characters, they are generally valid identifier chars + // unless they are punctuation or whitespace + return decoded_char != 0x0040 && decoded_char != 0x002F && decoded_char != 0x002A; // '@', '/', '*' + } + + // Regular character check + return is_identifier_char(content[position]); +} + /*! * Iterates over @p content starting at @p position to find the next uncommented * symbol and returns it and an index pointing past it. The symbol is either a * sequence of alphanumeric characters or a single non-alphanumeric character or - * an empty string if the end has been reached. + * an empty string if the end has been reached. Supports Unicode escape sequences. */ inline std::tuple next_symbol(std::string_view content, std::ptrdiff_t position = 0) noexcept { @@ -224,7 +412,7 @@ inline std::tuple next_symbol(std::string_view /*! * Iterates over @p content starting at @p position to find a string @p token * which is present in the source code neither inside a comment nor inside a - * string nor inside a character literal. 
+ * string nor inside a character literal. Supports Unicode escape sequences. * * Special case when @p token == ')', this function counts opening and closing * parentheses and returns the first parenthesis outside. @@ -238,7 +426,7 @@ inline std::tuple next_symbol(std::string_view inline std::ptrdiff_t find_token(std::string_view content, std::string_view token, std::ptrdiff_t position = 0, bool alphanumeric = false, std::ptrdiff_t stack = 0) noexcept { - while (position + std::ssize(token) <= std::ssize(content)) + while (position < std::ssize(content)) { position = ignore_whitespace_comments(content, position); @@ -247,15 +435,53 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke break; } - auto substr = content.substr(position, token.length()); - - if ((token != ")" or stack == 0) and substr == token - and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1])) - or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()])))))) + // Check for single character tokens (like '@', '/', '*') that might be Unicode-escaped + if (token.length() == 1) { - return position; + std::ptrdiff_t char_length; + if (char_matches_at_position(content, position, token[0], char_length)) + { + bool is_valid_match = true; + + // Apply alphanumeric constraints if needed + if (alphanumeric) + { + if (position > 0 && is_identifier_char(content[position - 1])) + { + is_valid_match = false; + } + else if (position + char_length < std::ssize(content) && is_identifier_char(content[position + char_length])) + { + is_valid_match = false; + } + } + + // Special handling for parentheses stack counting + if (token == ")" && stack == 0 && is_valid_match) + { + return position; + } + else if (token != ")" && is_valid_match) + { + return position; + } + } } - else if (content[position] == '\'') + // Multi-character tokens - check regular substring match + else if (position + 
std::ssize(token) <= std::ssize(content)) + { + auto substr = content.substr(position, token.length()); + + if ((token != ")" or stack == 0) and substr == token + and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1])) + or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()])))))) + { + return position; + } + } + + // Handle string and character literals, and parentheses counting + if (content[position] == '\'') { if (content.substr(position, 4) == "'\\''") { @@ -295,6 +521,16 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke { ++stack; } + else + { + // Check if we're at a Unicode escape and skip it appropriately + char32_t decoded_char; + std::ptrdiff_t unicode_length; + if (decode_unicode_escape(content, position, decoded_char, unicode_length)) + { + position += unicode_length - 1; // -1 because we'll increment at the end + } + } ++position; } @@ -320,8 +556,12 @@ inline std::tuple next_annotation(std::string_vie if (position < std::ssize(content)) { + // Calculate the actual length of the @ symbol (could be Unicode-escaped) + std::ptrdiff_t at_symbol_length; + char_matches_at_position(content, position, '@', at_symbol_length); + auto symbol = std::string_view(); - std::tie(symbol, end_pos) = next_symbol(content, position + 1); + std::tie(symbol, end_pos) = next_symbol(content, position + at_symbol_length); auto new_end_pos = end_pos; while (not symbol.empty()) diff --git a/test.sh b/test.sh index 93557cd..a5f38d1 100755 --- a/test.sh +++ b/test.sh @@ -152,6 +152,11 @@ test_file "Array.java" "Array.5.java" -a -n C -n D -n E -n F test_file "Package_info.java" "Package_info.1.java" -a -n MyAnn +test_file "Unicode_escapes.java" "Unicode_escapes.1.java" -n "TestAnnotation" +test_file "Unicode_escapes.java" "Unicode_escapes.2.java" -a -n "TestAnnotation" + +test_file "Unicode_comments.java" "Unicode_comments.1.java" -a -n "TestAnnotation" + 
################################################################################ # Tests for tool termination on invalid sources, result is irrelevant diff --git a/test_resources/Unicode_comments.1.java b/test_resources/Unicode_comments.1.java new file mode 100644 index 0000000..f823129 --- /dev/null +++ b/test_resources/Unicode_comments.1.java @@ -0,0 +1,12 @@ + +\u002F\u002F This is a Unicode-escaped single-line comment +\u002F\u002A This is a Unicode-escaped + multi-line comment \u002A\u002F +class Unicode_comments { + // Regular comment + /* Regular multi-line comment */ + + public void testMethod() { + String test = "Unicode comments test"; + } +} diff --git a/test_resources/Unicode_comments.java b/test_resources/Unicode_comments.java new file mode 100644 index 0000000..e5eff06 --- /dev/null +++ b/test_resources/Unicode_comments.java @@ -0,0 +1,14 @@ +import com.example.TestAnnotation; + +\u002F\u002F This is a Unicode-escaped single-line comment +@TestAnnotation +\u002F\u002A This is a Unicode-escaped + multi-line comment \u002A\u002F +class Unicode_comments { + // Regular comment + /* Regular multi-line comment */ + + public void testMethod() { + String test = "Unicode comments test"; + } +} diff --git a/test_resources/Unicode_escapes.1.java b/test_resources/Unicode_escapes.1.java new file mode 100644 index 0000000..25f8a6e --- /dev/null +++ b/test_resources/Unicode_escapes.1.java @@ -0,0 +1,16 @@ +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040TestAnnotation +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . 
(period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +} diff --git a/test_resources/Unicode_escapes.2.java b/test_resources/Unicode_escapes.2.java new file mode 100644 index 0000000..67c7677 --- /dev/null +++ b/test_resources/Unicode_escapes.2.java @@ -0,0 +1,15 @@ +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . (period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +} diff --git a/test_resources/Unicode_escapes.java b/test_resources/Unicode_escapes.java new file mode 100644 index 0000000..ccdf7e3 --- /dev/null +++ b/test_resources/Unicode_escapes.java @@ -0,0 +1,17 @@ +import com.example.TestAnnotation; +import com.example\u002FSubPackage\u002ESecondAnnotation; + +\u0040TestAnnotation +\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test") +class Unicode_escapes { + // This file tests Unicode escape sequences: + // \u0040 = @ (at symbol) + // \u002F = / (forward slash) + // \u002E = . (period/dot) + // \u002A = * (asterisk) + + public void testMethod() { + // Some comment with unicode escape \u0040 + String test = "Unicode \u0040 in string"; + } +}