From 0ce2e5c5ac1729eaff2bef136f32e1bb50bb48bb Mon Sep 17 00:00:00 2001
From: Piotr Bartman-Szwarc
Date: Sat, 7 Feb 2026 22:09:00 +0100
Subject: [PATCH] add qubes_pure_sanitize_string_safe_for_display

The added function replaces unacceptable characters with `_`.
I considered using the REPLACEMENT CHARACTER (U+FFFD) instead of `_`,
but the advantage of `_` is readability and its single-byte size.

The internal `validate_utf8_char` function has been renamed to
`validate_utf8_char_safe_for_display`, and the internal logic has been moved
to `validate_utf8_char_and_return_len`, which returns the negated length of
unsafe characters (and -1 for invalid UTF-8). This allows more efficient
replacement of multi-byte UTF-8 characters.
---
 qrexec-lib/pure.h           | 16 ++++++++
 qrexec-lib/unicode.c        | 70 ++++++++++++++++++++++++++++++++-----
 qrexec-lib/validator-test.c | 69 ++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 9 deletions(-)

diff --git a/qrexec-lib/pure.h b/qrexec-lib/pure.h
index 60222c84..73134f18 100644
--- a/qrexec-lib/pure.h
+++ b/qrexec-lib/pure.h
@@ -199,6 +199,22 @@ QUBES_PURE_PUBLIC bool
 qubes_pure_string_safe_for_display(const char *untrusted_str,
                                    size_t line_length);
 
+/**
+ * Copies a string, replacing each invalid byte and each character that is
+ * not safe for display with `_`.
+ *
+ * @param untrusted_str NUL-terminated input string (read-only)
+ * @param result Buffer for the sanitized output (at least max_line_length bytes)
+ * @param max_line_length Size of result in bytes, including the NUL terminator;
+ *        if 0, nothing is written
+ * @return The length of the sanitized string written to result, not counting
+ *         the NUL terminator
+ */
+QUBES_PURE_PUBLIC size_t
+qubes_pure_sanitize_string_safe_for_display(const char *untrusted_str,
+                                            char *result,
+                                            size_t max_line_length);
+
 /** Initialize a QubesSlice from a nul-terminated string. */
 static inline struct QubesSlice
 qubes_pure_buffer_init_from_nul_terminated_string(const char *str)
diff --git a/qrexec-lib/unicode.c b/qrexec-lib/unicode.c
index a79c13ae..727c1286 100644
--- a/qrexec-lib/unicode.c
+++ b/qrexec-lib/unicode.c
@@ -19,8 +19,9 @@ qubes_pure_code_point_safe_for_display(uint32_t code_point) {
 }
 
 /* validate single UTF-8 character
- * return bytes count of this character, or 0 if the character is invalid */
-static int validate_utf8_char(const uint8_t *untrusted_c) {
+ * return bytes count of this character, or a negative count of bytes to skip
+ * (-1 if invalid, minus bytes count if valid but not safe to display) */
+static int validate_utf8_char_and_return_len(const uint8_t *untrusted_c) {
     int tails_count = 0;
     int total_size = 0;
     uint32_t code_point;
@@ -66,7 +67,8 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             if (*untrusted_c >= 0xA0 && *untrusted_c <= 0xBF)
                 tails_count = 1;
             else
-                return 0;
+                // invalid UTF-8, skip this byte and try to parse the next one
+                return -1;
             code_point = *untrusted_c & 0x3F;
             break;
         case 0xE1 ... 0xEF:
@@ -80,7 +82,8 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             if (*untrusted_c >= 0x90 && *untrusted_c <= 0xBF)
                 tails_count = 2;
             else
-                return 0;
+                // invalid UTF-8, skip this byte and try to parse the next one
+                return -1;
             code_point = *untrusted_c & 0x3F;
             break;
         case 0xF1 ... 0xF4:
@@ -89,17 +92,24 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             code_point = *untrusted_c & 0x7;
             break;
         default:
-            return 0; // control ASCII or invalid UTF-8
+            return -1; // control ASCII or invalid UTF-8
     }
 
     while (tails_count-- > 0) {
         untrusted_c++;
         if (!(*untrusted_c >= 0x80 && *untrusted_c <= 0xBF))
-            return 0;
+            return -1;
         code_point = code_point << 6 | (*untrusted_c & 0x3F);
     }
 
-    return qubes_pure_code_point_safe_for_display(code_point) ? total_size : 0;
+    return qubes_pure_code_point_safe_for_display(code_point) ? total_size : -total_size;
+}
+
+/* validate single UTF-8 character
+ * return its bytes count, or 0 if it is invalid or not safe to display */
+static int validate_utf8_char_safe_for_display(const uint8_t *untrusted_c) {
+    int result = validate_utf8_char_and_return_len(untrusted_c);
+    return result > 0 ? result : 0;
 }
 
 // Statically assert that a statement is not reachable.
@@ -209,7 +219,7 @@ static ssize_t validate_path(const uint8_t *const untrusted_name,
                        (flags & QUBES_PURE_ALLOW_UNSAFE_CHARACTERS) != 0) {
                 /* loop will advance past this */
             } else {
-                int utf8_ret = validate_utf8_char((const unsigned char *)(untrusted_name + i));
+                int utf8_ret = validate_utf8_char_safe_for_display((const unsigned char *)(untrusted_name + i));
                 if (utf8_ret > 0) {
                     i += (size_t)(utf8_ret - 1); /* loop will do one more increment */
                 } else {
@@ -306,7 +316,7 @@ qubes_pure_string_safe_for_display(const char *untrusted_str, size_t line_length
         if (untrusted_str[i] >= 0x20 && untrusted_str[i] <= 0x7E) {
             i++;
         } else {
-            int utf8_ret = validate_utf8_char((const uint8_t *)(untrusted_str + i));
+            int utf8_ret = validate_utf8_char_safe_for_display((const uint8_t *)(untrusted_str + i));
             if (utf8_ret > 0) {
                 i += utf8_ret;
             } else {
@@ -316,3 +326,45 @@ qubes_pure_string_safe_for_display(const char *untrusted_str, size_t line_length
     } while (untrusted_str[i]);
     return true;
 }
+
+QUBES_PURE_PUBLIC size_t
+qubes_pure_sanitize_string_safe_for_display(const char *untrusted_str,
+                                            char *result,
+                                            size_t max_line_length)
+{
+    if (max_line_length == 0) {
+        return 0;
+    }
+    size_t i = 0;
+    size_t j = 0;
+    while (untrusted_str[i] && j < max_line_length - 1) {
+        if (untrusted_str[i] >= 0x20 && untrusted_str[i] <= 0x7E) {
+            // keep the valid ASCII character
+            result[j++] = untrusted_str[i++];
+            continue;
+        }
+        int utf8_ret = validate_utf8_char_and_return_len((const uint8_t *)(untrusted_str + i));
+        if (utf8_ret <= 0) {
+            // invalid byte, or unsafe character spanning -utf8_ret bytes:
+            // replace it (possibly multiple bytes) with a single '_'
+            result[j++] = '_';
+            // defensively treat 0 like one invalid byte so the loop always advances
+            i += (utf8_ret < 0) ? (size_t)(-utf8_ret) : 1;
+            continue;
+        }
+        // j < max_line_length - 1 holds here, so the subtraction cannot wrap;
+        // one byte stays reserved for the terminating NUL
+        if ((size_t)utf8_ret > max_line_length - 1 - j) {
+            // not enough space for the whole character, truncate here
+            break;
+        }
+        // copy the valid UTF-8 character to the result buffer
+        for (int k = 0; k < utf8_ret; k++) {
+            result[j++] = untrusted_str[i++];
+        }
+    }
+
+    // Enforce null termination of the result string
+    result[j] = '\0';
+    return j;
+}
diff --git a/qrexec-lib/validator-test.c b/qrexec-lib/validator-test.c
index c68d1992..e88fb819 100644
--- a/qrexec-lib/validator-test.c
+++ b/qrexec-lib/validator-test.c
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <string.h>
 #include "pure.h"
 #include
 
@@ -89,10 +90,78 @@ static int symlink_test(const struct symlink_test symlink_checks[], size_t size)
     return (int)failed;
 }
 
+static void test_string_sanitization(void)
+{
+    char buf[128];
+    size_t len;
+
+    // 1. Empty string
+    len = qubes_pure_sanitize_string_safe_for_display("", buf, sizeof(buf));
+    assert(len == 0);
+    assert(buf[0] == '\0');
+
+    // 2. Normal ASCII
+    len = qubes_pure_sanitize_string_safe_for_display("Hello", buf, sizeof(buf));
+    assert(len == 5);
+    assert(strcmp(buf, "Hello") == 0);
+
+    // 3. Safe UTF-8 (Greek Beta: \xCE\xB2)
+    len = qubes_pure_sanitize_string_safe_for_display("\xCE\xB2", buf, sizeof(buf));
+    assert(len == 2);
+    assert(strcmp(buf, "\xCE\xB2") == 0);
+
+    // 4. Unsafe UTF-8
+    // \U0001f642 is \xF0\x9F\x99\x82 (4 bytes).
+    len = qubes_pure_sanitize_string_safe_for_display("\xF0\x9F\x99\x82", buf, sizeof(buf));
+    assert(len == 1);
+    assert(strcmp(buf, "_") == 0);
+
+    // 5. Invalid UTF-8 (partial)
+    len = qubes_pure_sanitize_string_safe_for_display("\xE0", buf, sizeof(buf));
+    assert(len == 1);
+    assert(strcmp(buf, "_") == 0);
+
+    // 6. Invalid UTF-8 (bad continuation)
+    len = qubes_pure_sanitize_string_safe_for_display("\xE0 ", buf, sizeof(buf));
+    assert(len == 2);
+    assert(strcmp(buf, "_ ") == 0);
+
+    // 7. Max line length
+    len = qubes_pure_sanitize_string_safe_for_display("ABCD", buf, 4);
+    assert(len == 3);
+    assert(strcmp(buf, "ABC") == 0);
+
+    // 8. Truncation in middle of UTF-8
+    // needs 3+1=4, but we have 3
+    len = qubes_pure_sanitize_string_safe_for_display("A\xCE\xB2", buf, 3);
+    assert(len == 1);
+    assert(strcmp(buf, "A") == 0);
+
+    // 9. Multi-byte character that fits exactly
+    // 'A' + 2-byte Beta + NUL = 4 bytes, and we have 4
+    len = qubes_pure_sanitize_string_safe_for_display("A\xCE\xB2", buf, 4);
+    assert(len == 3);
+    assert(strcmp(buf, "A\xCE\xB2") == 0);
+
+    // 10. Unsafe char replacement fits
+    // A + Emoji, so 5 bytes but the emoji is replaced with '_' and fits
+    len = qubes_pure_sanitize_string_safe_for_display("A\xF0\x9F\x99\x82", buf, 3);
+    assert(len == 2);
+    assert(strcmp(buf, "A_") == 0);
+
+    // 11. mixed valid invalid
+    len = qubes_pure_sanitize_string_safe_for_display("a\x80""b", buf, 10);
+    assert(len == 3);
+    assert(strcmp(buf, "a_b") == 0);
+}
+
 int main(int argc, char **argv)
 {
     (void)argc;
     (void)argv;
+
+    test_string_sanitization();
+
     assert(qubes_pure_validate_file_name((const uint8_t *)u8"simple_safe_filename.txt"));
 
     // Directory traversal checks