fedora-java · mkoncek · Sep 9, 2025 · Sep 9, 2025 · Sep 23, 2025
diff --git a/src/java_symbols.hpp b/src/java_symbols.hpp
@@ -134,9 +134,149 @@ inline static auto strict_mode = std::optional<Strict_mode>();
  */
 namespace java_symbols
 {
+/*!
+ * Decodes the Java Unicode escape (JLS 3.3) that starts at @p position: a backslash
+ * preceded by an even number of backslashes, one or more 'u', then four hex digits.
+ *
+ * @param content The source text being scanned
+ * @param position Index of the candidate backslash
+ * @param decoded_char Receives the decoded 16-bit value; left untouched on failure
+ * @param sequence_length Receives the number of characters the escape spans
+ * (6 for \uXXXX, more when 'u' is repeated); set to 0 on failure
+ *
+ * @return True if a Unicode escape starts at @p position
+ */
+inline bool decode_unicode_escape(std::string_view content, std::ptrdiff_t position, char32_t& decoded_char, std::ptrdiff_t& sequence_length) noexcept
+{
+	sequence_length = 0;
+
+	// The shortest form, \uXXXX, needs six characters and starts with "\u"
+	if (position < 0 or position + 5 >= std::ssize(content) or content[position] != '\\' or content[position + 1] != 'u')
+	{
+		return false;
+	}
+
+	// Count the backslashes directly in front; an odd count means this one is itself escaped
+	auto run_start = position;
+	while (run_start > 0 and content[run_start - 1] == '\\')
+	{
+		--run_start;
+	}
+
+	// The 'u' marker may be repeated, e.g. \uuu0040
+	auto hex_start = position + 2;
+	while (hex_start < std::ssize(content) and content[hex_start] == 'u')
+	{
+		++hex_start;
+	}
+	if ((position - run_start) % 2 != 0 or hex_start + 4 > std::ssize(content))
+	{
+		return false;
+	}
+
+	// Validate and accumulate the four hex digits in a single pass
+	char32_t result = 0;
+	for (std::ptrdiff_t i = 0; i < 4; ++i)
+	{
+		const auto c = static_cast<unsigned char>(content[hex_start + i]);
+		if (not std::isxdigit(c))
+		{
+			return false;
+		}
+		result = result * 16 + (std::isdigit(c) ? c - '0' : (c >= 'a' ? c - 'a' : c - 'A') + 10);
+	}
+	decoded_char = result;
+	sequence_length = hex_start + 4 - position; // 6 for the plain \uXXXX form
+	return true;
+}
+
+/*!
+ * Checks whether the character at @p position, written either literally or as a
+ * Unicode escape, equals @p target_char.
+ *
+ * @param content The string content to check
+ * @param position The position to check
+ * @param target_char The character to match against
+ * @param actual_length Receives the length of the representation: the escape length if one was decoded, otherwise 1
+ *
+ * @return True if the character at position matches target_char
+ */
+inline bool char_matches_at_position(std::string_view content, std::ptrdiff_t position, char target_char, std::ptrdiff_t& actual_length) noexcept
+{
+	actual_length = 1; // Default for non-Unicode chars
+
+	if (position < 0 or position >= std::ssize(content))
+	{
+		return false;
+	}
+
+	char32_t decoded_char = 0;
+	std::ptrdiff_t unicode_length = 0;
+	if (decode_unicode_escape(content, position, decoded_char, unicode_length))
+	{
+		actual_length = unicode_length;
+		// Compare at full width: narrowing the decoded value to char would make e.g. \u012F equal '/'
+		return decoded_char == static_cast<char32_t>(static_cast<unsigned char>(target_char));
+	}
+
+	// Regular character check
+	return content[position] == target_char;
+}
+
+/*!
+ * Skips a Java comment at @p position whose delimiters may be written as Unicode
+ * escapes, e.g. \u002F\u002F for a line comment or \u002F\u002A for a block comment.
+ *
+ * @param content The string content to check
+ * @param position The position where a comment may start
+ *
+ * @return The position past the comment (the length of @p content if it is
+ * unterminated), or @p position unchanged if no comment starts there.
+ */
+inline std::ptrdiff_t check_unicode_comment_start(std::string_view content, std::ptrdiff_t position) noexcept
+{
+	// Both comment kinds open with a slash, so match it only once
+	std::ptrdiff_t slash_length = 0;
+	if (not char_matches_at_position(content, position, '/', slash_length))
+	{
+		return position;
+	}
+
+	const auto second = position + slash_length;
+	std::ptrdiff_t second_length = 0;
+
+	if (char_matches_at_position(content, second, '/', second_length))
+	{
+		// Single-line comment: it ends right after the next newline
+		const auto newline = content.find('\n', second + second_length);
+		// Compare against npos before converting to a signed position
+		return newline == content.npos ? std::ssize(content) : std::ptrdiff_t(newline) + 1;
+	}
+
+	if (char_matches_at_position(content, second, '*', second_length))
+	{
+		// Multi-line comment: look for the terminator, which may be escaped as well
+		for (auto cursor = second + second_length; cursor < std::ssize(content); ++cursor)
+		{
+			std::ptrdiff_t star_length = 0;
+			std::ptrdiff_t end_slash_length = 0;
+			if (char_matches_at_position(content, cursor, '*', star_length)
+				and char_matches_at_position(content, cursor + star_length, '/', end_slash_length))
+			{
+				return cursor + star_length + end_slash_length;
+			}
+		}
+
+		return std::ssize(content); // Unterminated comment
+	}
+
+	return position; // No comment found
+}
+
 /*!
  * Iterates over @p content starting at @p position to find the first character
  * which is not part of a Java comment nor a whitespace character.
+ * Supports Unicode-escaped comment delimiters.
  * 
  * @return The position of the first non-whitespace non-comment character or the
  * length of the string if none is found.
@@ -153,6 +293,7 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std::
 
 		auto result = position;
 
+		// Check for regular comment delimiters first
 		if (auto subst = content.substr(position, 2); subst == "//")
 		{
 			position = content.find('\n', position + 2);
@@ -175,6 +316,15 @@ inline std::ptrdiff_t ignore_whitespace_comments(std::string_view content, std::
 
 			position += 2;
 		}
+		else
+		{
+			// Check for Unicode-escaped comment delimiters
+			auto unicode_comment_pos = check_unicode_comment_start(content, position);
+			if (unicode_comment_pos != position)
+			{
+				position = unicode_comment_pos;
+			}
+		}
 
 		if (result == position)
 		{
@@ -190,11 +340,49 @@ inline bool is_identifier_char(char c) noexcept
 	return c == '_' or (not std::ispunct(static_cast<unsigned char>(c)) and not std::isspace(static_cast<unsigned char>(c)));
 }
 
+/*!
+ * Checks if a character at the given position (potentially Unicode-escaped) is a valid identifier character.
+ *
+ * @param content The string content to check
+ * @param position The position to check
+ * @param char_length Receives the length of the character representation: the escape length if one was decoded, otherwise 1
+ *
+ * @return True if the character at position is a valid identifier character
+ */
+inline bool is_identifier_char_at_position(std::string_view content, std::ptrdiff_t position, std::ptrdiff_t& char_length) noexcept
+{
+	char_length = 1; // Default for non-Unicode chars
+
+	if (position < 0 or position >= std::ssize(content))
+	{
+		return false;
+	}
+
+	// Check for Unicode escape first
+	char32_t decoded_char = 0;
+	std::ptrdiff_t unicode_length = 0;
+	if (decode_unicode_escape(content, position, decoded_char, unicode_length))
+	{
+		char_length = unicode_length;
+		// ASCII code points follow the same rule as literal characters
+		if (decoded_char <= 127)
+		{
+			return is_identifier_char(static_cast<char>(decoded_char));
+		}
+		// Every escape above the ASCII range is accepted as an identifier character;
+		// non-ASCII punctuation and whitespace are not filtered out here
+		return true;
+	}
+
+	// Regular character check
+	return is_identifier_char(content[position]);
+}
+
 /*!
  * Iterates over @p content starting at @p position to find the next uncommented
  * symbol and returns it and an index pointing past it. The symbol is either a
  * sequence of alphanumeric characters or a single non-alphanumeric character or
- * an empty string if the end has been reached.
+ * an empty string if the end has been reached. Supports Unicode escape sequences.
  */
 inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view content, std::ptrdiff_t position = 0) noexcept
 {
@@ -224,7 +412,7 @@ inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view
 /*!
  * Iterates over @p content starting at @p position to find a string @p token
  * which is present in the source code neither inside a comment nor inside a
- * string nor inside a character literal.
+ * string nor inside a character literal. Supports Unicode escape sequences.
  * 
  * Special case when @p token == ')', this function counts opening and closing
  * parentheses and returns the first parenthesis outside.
@@ -238,7 +426,7 @@ inline std::tuple<std::string_view, std::ptrdiff_t> next_symbol(std::string_view
 inline std::ptrdiff_t find_token(std::string_view content, std::string_view token,
 	std::ptrdiff_t position = 0, bool alphanumeric = false, std::ptrdiff_t stack = 0) noexcept
 {
-	while (position + std::ssize(token) <= std::ssize(content))
+	while (position < std::ssize(content))
 	{
 		position = ignore_whitespace_comments(content, position);
 
@@ -247,15 +435,53 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke
 			break;
 		}
 
-		auto substr = content.substr(position, token.length());
-
-		if ((token != ")" or stack == 0) and substr == token
-			and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1]))
-				or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()]))))))
+		// Check for single character tokens (like '@', '/', '*') that might be Unicode-escaped
+		if (token.length() == 1)
 		{
-			return position;
+			std::ptrdiff_t char_length;
+			if (char_matches_at_position(content, position, token[0], char_length))
+			{
+				bool is_valid_match = true;
+
+				// Apply alphanumeric constraints if needed
+				if (alphanumeric)
+				{
+					if (position > 0 && is_identifier_char(content[position - 1]))
+					{
+						is_valid_match = false;
+					}
+					else if (position + char_length < std::ssize(content) && is_identifier_char(content[position + char_length]))
+					{
+						is_valid_match = false;
+					}
+				}
+
+				// Special handling for parentheses stack counting
+				if (token == ")" && stack == 0 && is_valid_match)
+				{
+					return position;
+				}
+				else if (token != ")" && is_valid_match)
+				{
+					return position;
+				}
+			}
 		}
-		else if (content[position] == '\'')
+		// Multi-character tokens - check regular substring match
+		else if (position + std::ssize(token) <= std::ssize(content))
+		{
+			auto substr = content.substr(position, token.length());
+
+			if ((token != ")" or stack == 0) and substr == token
+				and not (alphanumeric and ((position > 0 and is_identifier_char(content[position - 1]))
+					or (position + std::ssize(token) < std::ssize(content) and (is_identifier_char(content[position + token.length()]))))))
+			{
+				return position;
+			}
+		}
+
+		// Handle string and character literals, and parentheses counting
+		if (content[position] == '\'')
 		{
 			if (content.substr(position, 4) == "'\\''")
 			{
@@ -295,6 +521,16 @@ inline std::ptrdiff_t find_token(std::string_view content, std::string_view toke
 		{
 			++stack;
 		}
+		else
+		{
+			// Check if we're at a Unicode escape and skip it appropriately
+			char32_t decoded_char;
+			std::ptrdiff_t unicode_length;
+			if (decode_unicode_escape(content, position, decoded_char, unicode_length))
+			{
+				position += unicode_length - 1; // -1 because we'll increment at the end
+			}
+		}
 
 		++position;
 	}
@@ -320,8 +556,12 @@ inline std::tuple<std::string_view, std::string> next_annotation(std::string_vie
 
 	if (position < std::ssize(content))
 	{
+		// Calculate the actual length of the @ symbol (could be Unicode-escaped)
+		std::ptrdiff_t at_symbol_length;
+		char_matches_at_position(content, position, '@', at_symbol_length);
+
 		auto symbol = std::string_view();
-		std::tie(symbol, end_pos) = next_symbol(content, position + 1);
+		std::tie(symbol, end_pos) = next_symbol(content, position + at_symbol_length);
 		auto new_end_pos = end_pos;
 
 		while (not symbol.empty())

diff --git a/test.sh b/test.sh
@@ -152,6 +152,11 @@ test_file "Array.java" "Array.5.java" -a -n C -n D -n E -n F
 
 test_file "Package_info.java" "Package_info.1.java" -a -n MyAnn
 
+test_file "Unicode_escapes.java" "Unicode_escapes.1.java" -n "TestAnnotation"
+test_file "Unicode_escapes.java" "Unicode_escapes.2.java" -a -n "TestAnnotation"
+
+test_file "Unicode_comments.java" "Unicode_comments.1.java" -a -n "TestAnnotation"
+
 ################################################################################
 # Tests for tool termination on invalid sources, result is irrelevant
 

diff --git a/test_resources/Unicode_comments.1.java b/test_resources/Unicode_comments.1.java
@@ -0,0 +1,12 @@
+
+\u002F\u002F This is a Unicode-escaped single-line comment
+\u002F\u002A This is a Unicode-escaped 
+   multi-line comment \u002A\u002F
+class Unicode_comments {
+    // Regular comment
+    /* Regular multi-line comment */
+
+    public void testMethod() {
+        String test = "Unicode comments test";
+    }
+}
diff --git a/test_resources/Unicode_comments.java b/test_resources/Unicode_comments.java
@@ -0,0 +1,14 @@
+import com.example.TestAnnotation;
+
+\u002F\u002F This is a Unicode-escaped single-line comment
+@TestAnnotation
+\u002F\u002A This is a Unicode-escaped 
+   multi-line comment \u002A\u002F
+class Unicode_comments {
+    // Regular comment
+    /* Regular multi-line comment */
+
+    public void testMethod() {
+        String test = "Unicode comments test";
+    }
+}
diff --git a/test_resources/Unicode_escapes.1.java b/test_resources/Unicode_escapes.1.java
@@ -0,0 +1,16 @@
+import com.example\u002FSubPackage\u002ESecondAnnotation;
+
+\u0040TestAnnotation
+\u0040com.example\u002FSubPackage\u002ESecondAnnotation(value = "test")
+class Unicode_escapes {
+    // This file tests Unicode escape sequences:
+    // \u0040 = @ (at symbol)
+    // \u002F = / (forward slash)
+    // \u002E = . (period/dot)
+    // \u002A = * (asterisk)
+
+    public void testMethod() {
+        // Some comment with unicode escape \u0040
+        String test = "Unicode \u0040 in string";
+    }
+}