diff --git a/cpython_lldb.py b/cpython_lldb.py index 92b35e1..2754562 100644 --- a/cpython_lldb.py +++ b/cpython_lldb.py @@ -181,22 +181,64 @@ class PyUnicodeObject(PyObject): U_2BYTE_KIND = 2 U_4BYTE_KIND = 4 + def _byte_order_suffix(self): + order = self.target.GetByteOrder() + if order == lldb.eByteOrderBig: + return "be" + else: + # treat PDP/endian-less targets as little endian, which matches + # the architectures CPython officially supports today + return "le" + + def _encoding_for_unit_size(self, unit_size): + if unit_size == 1: + return "latin-1" + elif unit_size == 2: + return "utf-16-{}".format(self._byte_order_suffix()) + elif unit_size == 4: + return "utf-32-{}".format(self._byte_order_suffix()) + raise ValueError("Unsupported code unit size: {}".format(unit_size)) + + def _code_unit_size(self, kind): + if kind == self.U_1BYTE_KIND: + return 1 + elif kind == self.U_2BYTE_KIND: + return 2 + elif kind == self.U_4BYTE_KIND: + return 4 + raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind)) + + def _decode_from_address(self, addr, length, unit_size, encoding=None): + if not addr or not length: + return "" + + error = lldb.SBError() + data = self.process.ReadMemory(addr, length * unit_size, error) + if error.Fail(): + raise RuntimeError( + "Failed to read unicode contents at 0x{:x}: {}".format( + addr, error.GetCString() + ) + ) + + encoding = encoding or self._encoding_for_unit_size(unit_size) + return data.decode(encoding) + @property def value(self): str_type = self.target.FindFirstType(self.cpython_struct) - value = self.deref.Cast(str_type) - state = ( - value.GetChildMemberWithName("_base") + unicode_value = self.deref.Cast(str_type) + ascii_base = ( + unicode_value.GetChildMemberWithName("_base") .GetChildMemberWithName("_base") - .GetChildMemberWithName("state") - ) - length = ( - value.GetChildMemberWithName("_base") - .GetChildMemberWithName("_base") - .GetChildMemberWithName("length") - .unsigned ) + compact_base = unicode_value.GetChildMemberWithName("_base") + state = ascii_base.GetChildMemberWithName("state") + length = ascii_base.GetChildMemberWithName("length").unsigned + wstr_length = compact_base.GetChildMemberWithName("wstr_length").unsigned + if not length and wstr_length: + length = wstr_length if not length: return "" @@ -208,29 +250,34 @@ def value(self): if is_ascii and compact and ready: # content is stored right after the data structure in memory ascii_type = self.target.FindFirstType("PyASCIIObject") - value = value.Cast(ascii_type) - addr = int(value.location, 16) + value.size + ascii_value = unicode_value.Cast(ascii_type) + addr = int(ascii_value.location, 16) + ascii_value.size - rv = self.process.ReadMemory(addr, length, lldb.SBError()) - return rv.decode("ascii") + return self._decode_from_address(addr, length, 1, encoding="ascii") elif compact and ready: # content is stored right after the data structure in memory compact_type = self.target.FindFirstType("PyCompactUnicodeObject") - value = value.Cast(compact_type) - addr = int(value.location, 16) + value.size - - rv = self.process.ReadMemory(addr, length * kind, lldb.SBError()) - if kind == self.U_1BYTE_KIND: - return rv.decode("latin-1") - elif kind == self.U_2BYTE_KIND: - return rv.decode("utf-16") - elif kind == self.U_4BYTE_KIND: - return rv.decode("utf-32") - else: - raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind)) + compact_value = unicode_value.Cast(compact_type) + addr = int(compact_value.location, 16) + compact_value.size + unit_size = self._code_unit_size(kind) + + return self._decode_from_address(addr, length, unit_size) + elif ready: + data_field = unicode_value.GetChildMemberWithName("data") + data_ptr = data_field.GetChildMemberWithName("any") + addr = data_ptr.unsigned + unit_size = self._code_unit_size(kind) + + return self._decode_from_address(addr, length, unit_size) else: - # TODO: add support for legacy unicode strings - raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind)) + # legacy unicode strings that only have wstr filled in + wstr = ascii_base.GetChildMemberWithName("wstr") + addr = wstr.unsigned + wchar_size = self.target.FindFirstType("wchar_t").size + if not wchar_size: + raise ValueError("Unsupported wchar_t size: {}".format(wchar_size)) + + return self._decode_from_address(addr, length, wchar_size) class PyNoneObject(PyObject): diff --git a/tests/test_pretty_printer.py b/tests/test_pretty_printer.py index 5776044..2fea1e2 100644 --- a/tests/test_pretty_printer.py +++ b/tests/test_pretty_printer.py @@ -5,16 +5,74 @@ from .conftest import run_lldb -def lldb_repr_from_frame(lldb_manager, value): +LEGACY_UNICODE_HELPERS = textwrap.dedent( + """ + import ctypes + + Py_ssize_t = ctypes.c_ssize_t + + class _PyASCIIObject(ctypes.Structure): + _fields_ = [ + ("ob_refcnt", Py_ssize_t), + ("ob_type", ctypes.c_void_p), + ("length", Py_ssize_t), + ("hash", Py_ssize_t), + ("state", ctypes.c_uint), + ("wstr", ctypes.c_void_p), + ] + + class _PyCompactUnicodeObject(ctypes.Structure): + _fields_ = [ + ("_base", _PyASCIIObject), + ("utf8_length", Py_ssize_t), + ("utf8", ctypes.c_void_p), + ("wstr_length", Py_ssize_t), + ] + + _PyUnicode_FromUnicode = ctypes.pythonapi.PyUnicode_FromUnicode + _PyUnicode_FromUnicode.argtypes = [ctypes.c_void_p, Py_ssize_t] + _PyUnicode_FromUnicode.restype = ctypes.py_object + + _PyUnicode_Ready = ctypes.pythonapi._PyUnicode_Ready + _PyUnicode_Ready.argtypes = [ctypes.py_object] + _PyUnicode_Ready.restype = ctypes.c_int + + def make_legacy_string(text, ready=False): + size = len(text) + legacy = _PyUnicode_FromUnicode(ctypes.c_void_p(), size) + header = _PyCompactUnicodeObject.from_address(id(legacy)) + buffer_type = ctypes.c_wchar * (size + 1) + buffer = buffer_type.from_address(header._base.wstr) + for index, char in enumerate(text): + buffer[index] = char + buffer[size] = '\\0' + if ready: + _PyUnicode_Ready(legacy) + return legacy + """ +) + + +def legacy_unicode_setup(text_literal, ready): + return LEGACY_UNICODE_HELPERS + textwrap.dedent( + """ + legacy_value = make_legacy_string({text_literal}, ready={ready}) + """ + ).format(text_literal=text_literal, ready="True" if ready else "False") + + +def lldb_repr_from_frame(lldb_manager, value, setup_code=""): # Set a breakpoint in the implementation of a function that is conveniently # called with a single argument `v`, whose representation we are trying to # scrape from the LLDB output. When the breakpoint is hit, the argument # value will be pretty-printed by `frame info` command. + setup_block = textwrap.dedent(setup_code) if setup_code else "" code = f""" from collections import * from six.moves import * import test_extension + {setup_block} test_extension.identity({value}) """ response = run_lldb( @@ -30,9 +88,9 @@ def lldb_repr_from_frame(lldb_manager, value): return match -def assert_lldb_repr(lldb_manager, value, expected, code_value=None): +def assert_lldb_repr(lldb_manager, value, expected, code_value=None, setup_code=""): value_repr = code_value or repr(value) - match = lldb_repr_from_frame(lldb_manager, value_repr) + match = lldb_repr_from_frame(lldb_manager, value_repr, setup_code=setup_code) assert match is not None if isinstance(value, (set, frozenset, dict)): @@ -101,6 +159,31 @@ def test_str(lldb): assert_lldb_repr(lldb, "æ", "(u'\\\\xe6')|('æ')") +def test_str_legacy_not_ready(lldb): + legacy_value = "\u041f\u0440\u0438\u0432\u0435\u0442" + setup_code = legacy_unicode_setup('"\\u041f\\u0440\\u0438\\u0432\\u0435\\u0442"', ready=False) + assert_lldb_repr( + lldb, + legacy_value, + "(u'\u041f\u0440\u0438\u0432\u0435\u0442')|('Привет')", + code_value="legacy_value", + setup_code=setup_code, + ) + + +def test_str_legacy_ready_non_compact(lldb): + legacy_value = "A\u2665B" + setup_code = legacy_unicode_setup('"A\\u2665B"', ready=True) + assert_lldb_repr( + lldb, + legacy_value, + "(u'A\u2665B')|('A♥B')", + code_value="legacy_value", + setup_code=setup_code, + ) + + + def test_list(lldb): assert_lldb_repr(lldb, [], r"\[\]") assert_lldb_repr(lldb, [1, 2, 3], r"\[1, 2, 3\]")