Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 251 additions & 11 deletions src/java_symbols.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,149 @@ inline static auto strict_mode = std::optional<Strict_mode>();
*/
namespace java_symbols
{
/*!
 * Decodes a Java Unicode escape sequence located at @p position.
 *
 * An escape consists of a backslash, one or more 'u' characters and exactly
 * four hexadecimal digits (e.g. \u0040 or \uu0040). A backslash which is
 * preceded by an odd number of contiguous backslashes is itself escaped and
 * therefore does not start a Unicode escape.
 *
 * @param content The string content containing a potential Unicode escape
 * @param position The index of the backslash which may start the escape
 * @param decoded_char Output parameter receiving the decoded UTF-16 code unit;
 *                     left untouched if no escape is found
 * @param sequence_length Output parameter receiving the number of characters
 *                        occupied by the escape; set to 0 if no escape is found
 *
 * @return True if a valid Unicode escape sequence was found and decoded
 */
inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t position, char32_t& decoded_char, std::ptrdiff_t& sequence_length) noexcept
{
    sequence_length = 0;

    const auto size = static_cast<std::ptrdiff_t>(content.size());

    // The shortest escape needs six characters: backslash, 'u' and four hex digits.
    if (position < 0 || position > size - 6 || content[position] != '\\' || content[position + 1] != 'u')
    {
        return false;
    }

    // A backslash preceded by an odd number of backslashes is escaped itself.
    std::ptrdiff_t preceding_backslashes = 0;

    while (preceding_backslashes < position && content[position - 1 - preceding_backslashes] == '\\')
    {
        ++preceding_backslashes;
    }

    if (preceding_backslashes % 2 != 0)
    {
        return false;
    }

    // Java permits any number of additional 'u' markers before the hex digits.
    auto hex_start = position + 2;

    while (hex_start < size && content[hex_start] == 'u')
    {
        ++hex_start;
    }

    if (hex_start > size - 4)
    {
        return false;
    }

    // Locale independent conversion of a single hex digit, -1 if it is none.
    const auto hex_value = [](char c) noexcept -> int
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }
        return -1;
    };

    // Validate and accumulate the four digits in a single pass.
    char32_t result = 0;

    for (std::ptrdiff_t i = 0; i < 4; ++i)
    {
        const auto digit = hex_value(content[hex_start + i]);

        if (digit < 0)
        {
            return false;
        }

        result = result * 16 + static_cast<char32_t>(digit);
    }

    decoded_char = result;
    sequence_length = hex_start + 4 - position;
    return true;
}

/*!
 * Checks if the character at the given position matches the target character.
 * The character may be written literally or as a Unicode escape sequence.
 *
 * @param content The string content to check
 * @param position The position to check
 * @param target_char The character to match against
 * @param actual_length Output parameter for the number of characters the
 *                      inspected character occupies in @p content: the length
 *                      of the escape sequence if one was decoded, 1 otherwise
 *
 * @return True if the character at position matches target_char
 */
inline bool char_matches_at_position(std::string_view content, std::ptrdiff_t position, char target_char, std::ptrdiff_t& actual_length) noexcept
{
    actual_length = 1; // Default for non-Unicode chars

    if (position < 0 || position >= static_cast<std::ptrdiff_t>(content.size()))
    {
        return false;
    }

    // First check for Unicode escape
    char32_t decoded_char = 0;
    std::ptrdiff_t unicode_length = 0;

    if (decode_unicode_escape(content, position, decoded_char, unicode_length))
    {
        actual_length = unicode_length;

        // Compare the full code unit. Narrowing it to char would let e.g.
        // U+0140 match '@' (0x40) because only the low byte would survive.
        return decoded_char == static_cast<char32_t>(static_cast<unsigned char>(target_char));
    }

    // Regular character check
    return content[position] == target_char;
}

/*!
 * Skips a Java comment whose delimiters may be written as Unicode escapes,
 * e.g. \u002F\u002F for a single-line comment or \u002F\u002A (slash-star)
 * for a multi-line comment. Delimiter characters written literally are
 * matched as well, so mixed forms are recognised too.
 *
 * @param content The string content to check
 * @param position The position at which a comment may start
 *
 * @return @p position unchanged if no comment starts there. Otherwise the
 *         position just past the newline ending a single-line comment or just
 *         past the closing delimiter of a multi-line comment, or the length of
 *         @p content if the comment is not terminated.
 */
inline std::ptrdiff_t check_unicode_comment_start(std::string_view content, std::ptrdiff_t position) noexcept
{
    const auto content_size = std::ssize(content);

    std::ptrdiff_t opener_length;
    std::ptrdiff_t marker_length;

    // Both comment kinds begin with a slash which must be followed by something.
    if (not char_matches_at_position(content, position, '/', opener_length) or position + opener_length >= content_size)
    {
        return position;
    }

    const auto marker_pos = position + opener_length;

    // Second slash: single-line comment which ends with the next newline.
    if (char_matches_at_position(content, marker_pos, '/', marker_length))
    {
        const auto newline = content.find('\n', marker_pos + marker_length);

        return newline == content.npos ? content_size : static_cast<std::ptrdiff_t>(newline) + 1;
    }

    // Asterisk: multi-line comment, its end may be Unicode-escaped as well.
    if (char_matches_at_position(content, marker_pos, '*', marker_length))
    {
        for (auto cursor = marker_pos + marker_length; cursor < content_size; ++cursor)
        {
            std::ptrdiff_t star_length;
            std::ptrdiff_t slash_length;

            if (char_matches_at_position(content, cursor, '*', star_length)
                and cursor + star_length < content_size
                and char_matches_at_position(content, cursor + star_length, '/', slash_length))
            {
                return cursor + star_length + slash_length;
            }
        }

        return content_size; // Unterminated comment
    }

    return position; // No comment found
}

/*!
* Iterates over @p content starting at @p position to find the first character
* which is not part of a Java comment nor a whitespace character.
* Supports Unicode-escaped comment delimiters.
*
* @return The position of the first non-whitespace non-comment character or the
* length of the string if none is found.
Expand All @@ -153,6 +293,7 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std::

auto result = position;

// Check for regular comment delimiters first
if (auto subst = content.substr(position, 2); subst == "//")
{
position = content.find('\n', position + 2);
Expand All @@ -175,6 +316,15 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std::

position += 2;
}
else
{
// Check for Unicode-escaped comment delimiters
auto unicode_comment_pos = check_unicode_comment_start(content, position);
if (unicode_comment_pos != position)
{
position = unicode_comment_pos;
}
}

if (result == position)
{
Expand All @@ -190,11 +340,49 @@ inline bool is_identifier_char(char c) noexcept
return c == '_' or (not std::ispunct(static_cast<unsigned char>(c)) and not std::isspace(static_cast<unsigned char>(c)));
}

/*!
 * Checks if the character at the given position is a valid identifier
 * character. The character may be written literally or as a Unicode escape.
 *
 * @param content The string content to check
 * @param position The position to check
 * @param char_length Output parameter for the number of characters the
 *                    inspected character occupies in @p content: the length of
 *                    the escape sequence if one was decoded, 1 otherwise
 *
 * @return True if the character at position is a valid identifier character
 */
inline bool is_identifier_char_at_position(std::string_view content, std::ptrdiff_t position, std::ptrdiff_t& char_length) noexcept
{
    char_length = 1; // Default for non-Unicode chars

    if (position < 0 || position >= static_cast<std::ptrdiff_t>(content.size()))
    {
        return false;
    }

    // Check for Unicode escape first
    char32_t decoded_char = 0;
    std::ptrdiff_t unicode_length = 0;

    if (decode_unicode_escape(content, position, decoded_char, unicode_length))
    {
        char_length = unicode_length;

        // Escaped ASCII characters are classified like their literal form.
        // Every code unit above the ASCII range is accepted as an identifier
        // character; no further classification is attempted for those.
        return decoded_char > 127 || is_identifier_char(static_cast<char>(decoded_char));
    }

    // Regular character check
    return is_identifier_char(content[position]);
}

/*!
* Iterates over @p content starting at @p position to find the next uncommented
* symbol and returns it and an index pointing past it. The symbol is either a
* sequence of alphanumeric characters or a single non-alphanumeric character or
* an empty string if the end has been reached.
* an empty string if the end has been reached. Supports Unicode escape sequences.
*/
inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view content, std::ptrdiff_t position = 0) noexcept
{
Expand Down Expand Up @@ -224,7 +412,7 @@ inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view
/*!
* Iterates over @p content starting at @p position to find a string @p token
* which is present in the source code neither inside a comment nor inside a
* string nor inside a character literal.
* string nor inside a character literal. Supports Unicode escape sequences.
*
* Special case when @p token == ')', this function counts opening and closing
* parentheses and returns the first parenthesis outside.
Expand All @@ -238,7 +426,7 @@ inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view
inline std::ptrdiff_t find_token(std::string_view content, std::string_view token,
std::ptrdiff_t position = 0, bool alphanumeric = false, std::ptrdiff_t stack = 0) noexcept
{
while (position + std::ssize(token) <= std::ssize(content))
while (position < std::ssize(content))
{
position = ignore_whitespace_comments(content, position);

Expand All @@ -247,15 +435,53 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke
break;
}

auto substr = content.substr(position, token.length());

if ((token != ")" or stack == 0) and substr == token
and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1]))
or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()]))))))
// Check for single character tokens (like '@', '/', '*') that might be Unicode-escaped
if (token.length() == 1)
{
return position;
std::ptrdiff_t char_length;
if (char_matches_at_position(content, position, token[0], char_length))
{
bool is_valid_match = true;

// Apply alphanumeric constraints if needed
if (alphanumeric)
{
if (position > 0 && is_identifier_char(content[position - 1]))
{
is_valid_match = false;
}
else if (position + char_length < std::ssize(content) && is_identifier_char(content[position + char_length]))
{
is_valid_match = false;
}
}

// Special handling for parentheses stack counting
if (token == ")" && stack == 0 && is_valid_match)
{
return position;
}
else if (token != ")" && is_valid_match)
{
return position;
}
}
}
else if (content[position] == '\'')
// Multi-character tokens - check regular substring match
else if (position + std::ssize(token) <= std::ssize(content))
{
auto substr = content.substr(position, token.length());

if ((token != ")" or stack == 0) and substr == token
and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1]))
or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()]))))))
{
return position;
}
}

// Handle string and character literals, and parentheses counting
if (content[position] == '\'')
{
if (content.substr(position, 4) == "'\\''")
{
Expand Down Expand Up @@ -295,6 +521,16 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke
{
++stack;
}
else
{
// Check if we're at a Unicode escape and skip it appropriately
char32_t decoded_char;
std::ptrdiff_t unicode_length;
if (decode_unicode_escape(content, position, decoded_char, unicode_length))
{
position += unicode_length - 1; // -1 because we'll increment at the end
}
}

++position;
}
Expand All @@ -320,8 +556,12 @@ inline std::tuple<std::string_view, std::string> next_annotation(std::string_vie

if (position < std::ssize(content))
{
// Calculate the actual length of the @ symbol (could be Unicode-escaped)
std::ptrdiff_t at_symbol_length;
char_matches_at_position(content, position, '@', at_symbol_length);

auto symbol = std::string_view();
std::tie(symbol, end_pos) = next_symbol(content, position + 1);
std::tie(symbol, end_pos) = next_symbol(content, position + at_symbol_length);
auto new_end_pos = end_pos;

while (not symbol.empty())
Expand Down
5 changes: 5 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ test_file "Array.java" "Array.5.java" -a -n C -n D -n E -n F

test_file "Package_info.java" "Package_info.1.java" -a -n MyAnn

test_file "Unicode_escapes.java" "Unicode_escapes.1.java" -n "TestAnnotation"
test_file "Unicode_escapes.java" "Unicode_escapes.2.java" -a -n "TestAnnotation"

test_file "Unicode_comments.java" "Unicode_comments.1.java" -a -n "TestAnnotation"

################################################################################
# Tests for tool termination on invalid sources, result is irrelevant

Expand Down
12 changes: 12 additions & 0 deletions test_resources/Unicode_comments.1.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

\u002F\u002F This is a Unicode-escaped single-line comment
\u002F\u002A This is a Unicode-escaped
multi-line comment \u002A\u002F
class Unicode_comments {
// Regular comment
/* Regular multi-line comment */

public void testMethod() {
String test = "Unicode comments test";
}
}
14 changes: 14 additions & 0 deletions test_resources/Unicode_comments.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import com.example.TestAnnotation;

\u002F\u002F This is a Unicode-escaped single-line comment
@TestAnnotation
\u002F\u002A This is a Unicode-escaped
multi-line comment \u002A\u002F
class Unicode_comments {
// Regular comment
/* Regular multi-line comment */

public void testMethod() {
String test = "Unicode comments test";
}
}
16 changes: 16 additions & 0 deletions test_resources/Unicode_escapes.1.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import com.example\u002FSubPackage\u002ESecondAnnotation;

\u0040TestAnnotation
\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test")
class Unicode_escapes {
// This file tests Unicode escape sequences:
// \u0040 = @ (at symbol)
// \u002F = / (forward slash)
// \u002E = . (period/dot)
// \u002A = * (asterisk)

public void testMethod() {
// Some comment with unicode escape \u0040
String test = "Unicode \u0040 in string";
}
}
Loading