malor · Kuinox · Nov 19, 2025
diff --git a/cpython_lldb.py b/cpython_lldb.py
@@ -181,22 +181,64 @@ class PyUnicodeObject(PyObject):
     U_2BYTE_KIND = 2
     U_4BYTE_KIND = 4
 
+    def _byte_order_suffix(self):
+        """Return the codec suffix ("be" or "le") for the target's byte order."""
+        is_big_endian = self.target.GetByteOrder() == lldb.eByteOrderBig
+        if is_big_endian:
+            return "be"
+        # PDP/endian-less targets are treated as little endian, which matches
+        # the architectures CPython officially supports today
+        return "le"
+
+    def _encoding_for_unit_size(self, unit_size):
+        """Map a code unit size in bytes (1, 2 or 4) to a Python codec name."""
+        if unit_size == 1:
+            return "latin-1"
+        if unit_size not in (2, 4):
+            raise ValueError("Unsupported code unit size: {}".format(unit_size))
+        template = "utf-16-{}" if unit_size == 2 else "utf-32-{}"
+        return template.format(self._byte_order_suffix())
+
+    def _code_unit_size(self, kind):
+        """Return the number of bytes per code unit for a PyUnicode *kind*."""
+        # the kind constants are checked in order; the first equal one wins
+        sizes = ((self.U_1BYTE_KIND, 1), (self.U_2BYTE_KIND, 2), (self.U_4BYTE_KIND, 4))
+        for known_kind, size in sizes:
+            if kind == known_kind:
+                return size
+        raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))
+
+    def _decode_from_address(self, addr, length, unit_size, encoding=None):
+        """Read *length* code units of *unit_size* bytes at *addr* and decode them."""
+        if not (addr and length):
+            return ""
+
+        read_error = lldb.SBError()
+        raw = self.process.ReadMemory(addr, length * unit_size, read_error)
+        if read_error.Fail():
+            message = "Failed to read unicode contents at 0x{:x}: {}".format(
+                addr, read_error.GetCString()
+            )
+            raise RuntimeError(message)
+
+        # an explicit encoding wins; otherwise derive one from the unit size
+        return raw.decode(encoding or self._encoding_for_unit_size(unit_size))
+
     @property
     def value(self):
         str_type = self.target.FindFirstType(self.cpython_struct)
 
-        value = self.deref.Cast(str_type)
-        state = (
-            value.GetChildMemberWithName("_base")
+        unicode_value = self.deref.Cast(str_type)
+        ascii_base = (
+            unicode_value.GetChildMemberWithName("_base")
             .GetChildMemberWithName("_base")
-            .GetChildMemberWithName("state")
-        )
-        length = (
-            value.GetChildMemberWithName("_base")
-            .GetChildMemberWithName("_base")
-            .GetChildMemberWithName("length")
-            .unsigned
         )
+        compact_base = unicode_value.GetChildMemberWithName("_base")
+        state = ascii_base.GetChildMemberWithName("state")
+        length = ascii_base.GetChildMemberWithName("length").unsigned
+        wstr_length = compact_base.GetChildMemberWithName("wstr_length").unsigned
+        if not length and wstr_length:
+            length = wstr_length
         if not length:
             return ""
 
@@ -208,29 +250,34 @@ def value(self):
         if is_ascii and compact and ready:
             # content is stored right after the data structure in memory
             ascii_type = self.target.FindFirstType("PyASCIIObject")
-            value = value.Cast(ascii_type)
-            addr = int(value.location, 16) + value.size
+            ascii_value = unicode_value.Cast(ascii_type)
+            addr = int(ascii_value.location, 16) + ascii_value.size
 
-            rv = self.process.ReadMemory(addr, length, lldb.SBError())
-            return rv.decode("ascii")
+            return self._decode_from_address(addr, length, 1, encoding="ascii")
         elif compact and ready:
             # content is stored right after the data structure in memory
             compact_type = self.target.FindFirstType("PyCompactUnicodeObject")
-            value = value.Cast(compact_type)
-            addr = int(value.location, 16) + value.size
-
-            rv = self.process.ReadMemory(addr, length * kind, lldb.SBError())
-            if kind == self.U_1BYTE_KIND:
-                return rv.decode("latin-1")
-            elif kind == self.U_2BYTE_KIND:
-                return rv.decode("utf-16")
-            elif kind == self.U_4BYTE_KIND:
-                return rv.decode("utf-32")
-            else:
-                raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))
+            compact_value = unicode_value.Cast(compact_type)
+            addr = int(compact_value.location, 16) + compact_value.size
+            unit_size = self._code_unit_size(kind)
+
+            return self._decode_from_address(addr, length, unit_size)
+        elif ready:
+            data_field = unicode_value.GetChildMemberWithName("data")
+            data_ptr = data_field.GetChildMemberWithName("any")
+            addr = data_ptr.unsigned
+            unit_size = self._code_unit_size(kind)
+
+            return self._decode_from_address(addr, length, unit_size)
         else:
-            # TODO: add support for legacy unicode strings
-            raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))
+            # legacy unicode strings that only have wstr filled in
+            wstr = ascii_base.GetChildMemberWithName("wstr")
+            addr = wstr.unsigned
+            wchar_size = self.target.FindFirstType("wchar_t").size
+            if not wchar_size:
+                raise ValueError("Unsupported wchar_t size: {}".format(wchar_size))
+
+            return self._decode_from_address(addr, length, wchar_size)
 
 
 class PyNoneObject(PyObject):

diff --git a/tests/test_pretty_printer.py b/tests/test_pretty_printer.py
@@ -5,16 +5,74 @@
 from .conftest import run_lldb
 
 
-def lldb_repr_from_frame(lldb_manager, value):
+LEGACY_UNICODE_HELPERS = textwrap.dedent(
+    """
+    import ctypes
+
+    Py_ssize_t = ctypes.c_ssize_t
+
+    class _PyASCIIObject(ctypes.Structure):
+        _fields_ = [
+            ("ob_refcnt", Py_ssize_t),
+            ("ob_type", ctypes.c_void_p),
+            ("length", Py_ssize_t),
+            ("hash", Py_ssize_t),
+            ("state", ctypes.c_uint),
+            ("wstr", ctypes.c_void_p),
+        ]
+
+    class _PyCompactUnicodeObject(ctypes.Structure):
+        _fields_ = [
+            ("_base", _PyASCIIObject),
+            ("utf8_length", Py_ssize_t),
+            ("utf8", ctypes.c_void_p),
+            ("wstr_length", Py_ssize_t),
+        ]
+
+    _PyUnicode_FromUnicode = ctypes.pythonapi.PyUnicode_FromUnicode
+    _PyUnicode_FromUnicode.argtypes = [ctypes.c_void_p, Py_ssize_t]
+    _PyUnicode_FromUnicode.restype = ctypes.py_object
+
+    _PyUnicode_Ready = ctypes.pythonapi._PyUnicode_Ready
+    _PyUnicode_Ready.argtypes = [ctypes.py_object]
+    _PyUnicode_Ready.restype = ctypes.c_int
+
+    def make_legacy_string(text, ready=False):
+        size = len(text)
+        legacy = _PyUnicode_FromUnicode(ctypes.c_void_p(), size)
+        header = _PyCompactUnicodeObject.from_address(id(legacy))
+        buffer_type = ctypes.c_wchar * (size + 1)
+        buffer = buffer_type.from_address(header._base.wstr)
+        for index, char in enumerate(text):
+            buffer[index] = char
+        buffer[size] = '\\0'
+        if ready:
+            _PyUnicode_Ready(legacy)
+        return legacy
+    """
+)
+
+
+def legacy_unicode_setup(text_literal, ready):
+    """Return helper source plus a line binding ``legacy_value`` to a legacy str."""
+    ready_literal = "True" if ready else "False"
+    call = "make_legacy_string({}, ready={})".format(text_literal, ready_literal)
+    # same text the dedented template yields: newline, assignment, newline
+    return LEGACY_UNICODE_HELPERS + "\nlegacy_value = " + call + "\n"
+
+
+def lldb_repr_from_frame(lldb_manager, value, setup_code=""):
     # Set a breakpoint in the implementation of a function that is conveniently
     # called with a single argument `v`, whose representation we are trying to
     # scrape from the LLDB output. When the breakpoint is hit, the argument
     # value will be pretty-printed by `frame info` command.
+    setup_block = textwrap.dedent(setup_code) if setup_code else ""
     code = f"""
         from collections import *
         from six.moves import *
 
         import test_extension
+        {setup_block}
         test_extension.identity({value})
     """
     response = run_lldb(
@@ -30,9 +88,9 @@ def lldb_repr_from_frame(lldb_manager, value):
     return match
 
 
-def assert_lldb_repr(lldb_manager, value, expected, code_value=None):
+def assert_lldb_repr(lldb_manager, value, expected, code_value=None, setup_code=""):
     value_repr = code_value or repr(value)
-    match = lldb_repr_from_frame(lldb_manager, value_repr)
+    match = lldb_repr_from_frame(lldb_manager, value_repr, setup_code=setup_code)
     assert match is not None
 
     if isinstance(value, (set, frozenset, dict)):
@@ -101,6 +159,31 @@ def test_str(lldb):
     assert_lldb_repr(lldb, "æ", "(u'\\\\xe6')|('æ')")
 
 
+def test_str_legacy_not_ready(lldb):
+    """A legacy str holding only wstr (never readied) prints its Cyrillic text."""
+    setup_code = legacy_unicode_setup('"\\u041f\\u0440\\u0438\\u0432\\u0435\\u0442"', ready=False)
+    assert_lldb_repr(
+        lldb,
+        "\u041f\u0440\u0438\u0432\u0435\u0442",
+        "(u'\u041f\u0440\u0438\u0432\u0435\u0442')|('Привет')",
+        code_value="legacy_value",
+        setup_code=setup_code,
+    )
+
+
+def test_str_legacy_ready_non_compact(lldb):
+    """A legacy str passed through _PyUnicode_Ready still prints its text."""
+    setup_code = legacy_unicode_setup('"A\\u2665B"', ready=True)
+    assert_lldb_repr(
+        lldb,
+        "A\u2665B",
+        "(u'A\u2665B')|('A♥B')",
+        code_value="legacy_value",
+        setup_code=setup_code,
+    )
+
+
+
 def test_list(lldb):
     assert_lldb_repr(lldb, [], r"\[\]")
     assert_lldb_repr(lldb, [1, 2, 3], r"\[1, 2, 3\]")