QubesOS · piotrbartman · Feb 7, 2026
diff --git a/qrexec-lib/pure.h b/qrexec-lib/pure.h
@@ -199,6 +199,19 @@ QUBES_PURE_PUBLIC bool
 qubes_pure_string_safe_for_display(const char *untrusted_str,
                                    size_t line_length);
 
+/**
+ * Copy a string, replacing each invalid byte or unsafe-to-display character
+ * (however many bytes long) with a single `_`.
+ * @param untrusted_str NUL-terminated input string (read-only)
+ * @param result Output buffer of at least max_line_length bytes
+ * @param max_line_length Size of result, including the terminating NUL
+ * @return Length of the sanitized string in result, excluding the NUL
+ */
+QUBES_PURE_PUBLIC size_t
+qubes_pure_sanitize_string_safe_for_display(const char *untrusted_str,
+                                            char *result,
+                                            size_t max_line_length);
+
 /** Initialize a QubesSlice from a nul-terminated string. */
 static inline struct QubesSlice
 qubes_pure_buffer_init_from_nul_terminated_string(const char *str)

diff --git a/qrexec-lib/unicode.c b/qrexec-lib/unicode.c
@@ -19,8 +19,9 @@ qubes_pure_code_point_safe_for_display(uint32_t code_point) {
 }
 
 /* validate single UTF-8 character
- * return bytes count of this character, or 0 if the character is invalid */
-static int validate_utf8_char(const uint8_t *untrusted_c) {
+ * return bytes count of this character if it is valid and safe to display,
+ * otherwise minus the number of bytes the caller should skip */
+static int validate_utf8_char_and_return_len(const uint8_t *untrusted_c) {
     int tails_count = 0;
     int total_size = 0;
     uint32_t code_point;
@@ -66,7 +67,8 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             if (*untrusted_c >= 0xA0 && *untrusted_c <= 0xBF)
                 tails_count = 1;
             else
-                return 0;
+                // invalid UTF-8, skip this byte and try to parse the next one
+                return -1;
             code_point = *untrusted_c & 0x3F;
             break;
         case 0xE1 ... 0xEF:
@@ -80,7 +82,8 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             if (*untrusted_c >= 0x90 && *untrusted_c <= 0xBF)
                 tails_count = 2;
             else
-                return 0;
+                // invalid UTF-8, skip this byte and try to parse the next one
+                return -1;
             code_point = *untrusted_c & 0x3F;
             break;
         case 0xF1 ... 0xF4:
@@ -89,17 +92,24 @@ static int validate_utf8_char(const uint8_t *untrusted_c) {
             code_point = *untrusted_c & 0x7;
             break;
         default:
-            return 0; // control ASCII or invalid UTF-8
+            return -1; // control ASCII or invalid UTF-8
     }
 
     while (tails_count-- > 0) {
         untrusted_c++;
         if (!(*untrusted_c >= 0x80 && *untrusted_c <= 0xBF))
-            return 0;
+            return -1;
         code_point = code_point << 6 | (*untrusted_c & 0x3F);
     }
 
-    return qubes_pure_code_point_safe_for_display(code_point) ? total_size : 0;
+    return qubes_pure_code_point_safe_for_display(code_point) ? total_size : -total_size;
+}
+
+/* validate single UTF-8 character
+ * return its byte count, or 0 if it is invalid or not safe to display */
+static int validate_utf8_char_safe_for_display(const uint8_t *untrusted_c) {
+      int result = validate_utf8_char_and_return_len(untrusted_c);
+      return result > 0 ? result : 0;
 }
 
 // Statically assert that a statement is not reachable.
@@ -209,7 +219,7 @@ static ssize_t validate_path(const uint8_t *const untrusted_name,
                    (flags & QUBES_PURE_ALLOW_UNSAFE_CHARACTERS) != 0) {
             /* loop will advance past this */
         } else {
-            int utf8_ret = validate_utf8_char((const unsigned char *)(untrusted_name + i));
+            int utf8_ret = validate_utf8_char_safe_for_display((const unsigned char *)(untrusted_name + i));
             if (utf8_ret > 0) {
                 i += (size_t)(utf8_ret - 1); /* loop will do one more increment */
             } else {
@@ -306,7 +316,7 @@ qubes_pure_string_safe_for_display(const char *untrusted_str, size_t line_length
         if (untrusted_str[i] >= 0x20 && untrusted_str[i] <= 0x7E) {
             i++;
         } else {
-            int utf8_ret = validate_utf8_char((const uint8_t *)(untrusted_str + i));
+            int utf8_ret = validate_utf8_char_safe_for_display((const uint8_t *)(untrusted_str + i));
             if (utf8_ret > 0) {
                 i += utf8_ret;
             } else {
@@ -316,3 +326,42 @@ qubes_pure_string_safe_for_display(const char *untrusted_str, size_t line_length
     } while (untrusted_str[i]);
     return true;
 }
+
+QUBES_PURE_PUBLIC size_t
+qubes_pure_sanitize_string_safe_for_display(const char *untrusted_str,
+                                            char *result,
+                                            size_t max_line_length)
+{
+    // max_line_length counts the NUL; a zero-size buffer cannot even hold that
+    if (max_line_length == 0) {
+        return 0;
+    }
+    const size_t capacity = max_line_length - 1; // bytes usable before the NUL
+    size_t i = 0; // read offset in untrusted_str
+    size_t j = 0; // write offset in result
+    while (untrusted_str[i] && j < capacity) {
+        if (untrusted_str[i] >= 0x20 && untrusted_str[i] <= 0x7E) {
+            result[j++] = untrusted_str[i++]; // printable ASCII, keep as-is
+            continue;
+        }
+        int utf8_ret = validate_utf8_char_and_return_len((const uint8_t *)(untrusted_str + i));
+        if (utf8_ret <= 0) {
+            // invalid/unsafe: emit one '_', skip -utf8_ret bytes (min 1 to advance)
+            result[j++] = '_';
+            i += utf8_ret < 0 ? (size_t)(-utf8_ret) : (size_t)1;
+            continue;
+        }
+        if ((size_t)utf8_ret > capacity - j) {
+            // the whole character would not fit before the NUL, truncate here
+            break;
+        }
+        // copy the valid UTF-8 character; one that exactly fills the buffer fits
+        for (int k = 0; k < utf8_ret; k++) {
+            result[j++] = untrusted_str[i++];
+        }
+    }
+
+    // Enforce null termination; j <= capacity keeps this write inside result
+    result[j] = '\0';
+    return j;
+}
diff --git a/qrexec-lib/validator-test.c b/qrexec-lib/validator-test.c
@@ -2,6 +2,7 @@
 #include <inttypes.h>
 #include <stdlib.h>
 #include <errno.h>
+#include <string.h>
 
 #include "pure.h"
 #include <unicode/utf8.h>
@@ -89,10 +90,72 @@ static int symlink_test(const struct symlink_test symlink_checks[], size_t size)
     return (int)failed;
 }
 
+static void test_string_sanitization(void)
+{
+    char buf[128];
+    size_t len;
+    // Each case checks both the returned length and the buffer contents.
+    // 1. Empty string: nothing copied, only the terminator is written
+    len = qubes_pure_sanitize_string_safe_for_display("", buf, sizeof(buf));
+    assert(len == 0);
+    assert(buf[0] == '\0');
+
+    // 2. Printable ASCII is copied unchanged
+    len = qubes_pure_sanitize_string_safe_for_display("Hello", buf, sizeof(buf));
+    assert(len == 5);
+    assert(strcmp(buf, "Hello") == 0);
+
+    // 3. Safe UTF-8 (Greek Beta: \xCE\xB2) is kept as-is
+    len = qubes_pure_sanitize_string_safe_for_display("\xCE\xB2", buf, sizeof(buf));
+    assert(len == 2);
+    assert(strcmp(buf, "\xCE\xB2") == 0);
+
+    // 4. Unsafe UTF-8: the whole 4-byte sequence becomes a single '_'
+    // \U0001f642 is \xF0\x9F\x99\x82 (4 bytes).
+    len = qubes_pure_sanitize_string_safe_for_display("\xF0\x9F\x99\x82", buf, sizeof(buf));
+    assert(len == 1);
+    assert(strcmp(buf, "_") == 0);
+
+    // 5. Invalid UTF-8 (lead byte with no continuation before the NUL)
+    len = qubes_pure_sanitize_string_safe_for_display("\xE0", buf, sizeof(buf));
+    assert(len == 1);
+    assert(strcmp(buf, "_") == 0);
+
+    // 6. Invalid UTF-8 (bad continuation): only the lead byte is replaced
+    len = qubes_pure_sanitize_string_safe_for_display("\xE0 ", buf, sizeof(buf));
+    assert(len == 2);
+    assert(strcmp(buf, "_ ") == 0);
+
+    // 7. Max line length: a 4-byte buffer holds 3 characters plus the NUL
+    len = qubes_pure_sanitize_string_safe_for_display("ABCD", buf, 4);
+    assert(len == 3);
+    assert(strcmp(buf, "ABC") == 0);
+
+    // 8. Truncation in middle of UTF-8: the character is dropped entirely
+    // "A" plus the 2-byte character needs 3+1=4 bytes, but the buffer has 3
+    len = qubes_pure_sanitize_string_safe_for_display("A\xCE\xB2", buf, 3);
+    assert(len == 1);
+    assert(strcmp(buf, "A") == 0);
+
+    // 9. Unsafe char replacement fits
+    // A + emoji is 5 input bytes, but the emoji shrinks to '_' so "A_" fits in 3
+    len = qubes_pure_sanitize_string_safe_for_display("A\xF0\x9F\x99\x82", buf, 3);
+    assert(len == 2);
+    assert(strcmp(buf, "A_") == 0);
+
+    // 10. Stray continuation byte between valid ASCII characters
+    len = qubes_pure_sanitize_string_safe_for_display("a\x80""b", buf, 10);
+    assert(len == 3);
+    assert(strcmp(buf, "a_b") == 0);
+}
+
 int main(int argc, char **argv)
 {
     (void)argc;
     (void)argv;
+
+    test_string_sanitization();
+
     assert(qubes_pure_validate_file_name((const uint8_t *)u8"simple_safe_filename.txt"));
 
     // Directory traversal checks