Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 75 additions & 28 deletions cpython_lldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,22 +181,64 @@ class PyUnicodeObject(PyObject):
U_2BYTE_KIND = 2
U_4BYTE_KIND = 4

def _byte_order_suffix(self):
    """Return the codec-name suffix for the target's byte order.

    Gives ``"be"`` when the target reports ``lldb.eByteOrderBig`` and
    ``"le"`` for every other reported byte order.
    """
    # Anything that is not big endian (PDP, endian-less targets) is
    # deliberately treated as little endian, which matches the
    # architectures CPython officially supports today.
    is_big_endian = self.target.GetByteOrder() == lldb.eByteOrderBig
    return "be" if is_big_endian else "le"

def _encoding_for_unit_size(self, unit_size):
    """Pick the codec used to decode code units of ``unit_size`` bytes.

    One-byte units are decoded as latin-1; two- and four-byte units use the
    UTF-16 / UTF-32 codec whose suffix comes from ``_byte_order_suffix``.

    Raises:
        ValueError: if ``unit_size`` is not 1, 2 or 4.
    """
    if unit_size == 1:
        return "latin-1"
    if unit_size != 2 and unit_size != 4:
        raise ValueError("Unsupported code unit size: {}".format(unit_size))

    # Only the multi-byte codecs depend on the byte order of the target.
    template = "utf-16-{}" if unit_size == 2 else "utf-32-{}"
    return template.format(self._byte_order_suffix())

def _code_unit_size(self, kind):
    """Translate a PyUnicodeObject ``kind`` into the bytes per code unit.

    Raises:
        ValueError: if ``kind`` matches none of the ``U_*BYTE_KIND``
            constants of this class.
    """
    known_sizes = (
        (self.U_1BYTE_KIND, 1),
        (self.U_2BYTE_KIND, 2),
        (self.U_4BYTE_KIND, 4),
    )
    for candidate, size in known_sizes:
        if kind == candidate:
            return size
    raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))

def _decode_from_address(self, addr, length, unit_size, encoding=None):
    """Read ``length`` code units from the debugged process and decode them.

    Args:
        addr: address of the first code unit in the debugged process.
        length: number of code units to read.
        unit_size: size of one code unit in bytes.
        encoding: codec to decode with; when omitted (or empty) it is
            derived from ``unit_size`` via ``_encoding_for_unit_size``.

    Returns:
        The decoded text, or ``""`` when ``addr`` or ``length`` is falsy.

    Raises:
        RuntimeError: if LLDB reports that the memory read failed.
    """
    if not addr or not length:
        return ""

    error = lldb.SBError()
    data = self.process.ReadMemory(addr, length * unit_size, error)
    if error.Fail():
        raise RuntimeError(
            "Failed to read unicode contents at 0x{:x}: {}".format(
                addr, error.GetCString()
            )
        )

    encoding = encoding or self._encoding_for_unit_size(unit_size)
    # A str may legally contain lone surrogates (e.g. "\ud800"), and the
    # strict UTF-16/UTF-32 decoders reject those code units. "surrogatepass"
    # lets them through; for other codecs (ascii, latin-1) the handler
    # re-raises the original error, so their behaviour is unchanged.
    return data.decode(encoding, "surrogatepass")

@property
def value(self):
str_type = self.target.FindFirstType(self.cpython_struct)

value = self.deref.Cast(str_type)
state = (
value.GetChildMemberWithName("_base")
unicode_value = self.deref.Cast(str_type)
ascii_base = (
unicode_value.GetChildMemberWithName("_base")
.GetChildMemberWithName("_base")
.GetChildMemberWithName("state")
)
length = (
value.GetChildMemberWithName("_base")
.GetChildMemberWithName("_base")
.GetChildMemberWithName("length")
.unsigned
)
compact_base = unicode_value.GetChildMemberWithName("_base")
state = ascii_base.GetChildMemberWithName("state")
length = ascii_base.GetChildMemberWithName("length").unsigned
wstr_length = compact_base.GetChildMemberWithName("wstr_length").unsigned
if not length and wstr_length:
length = wstr_length
if not length:
return ""

Expand All @@ -208,29 +250,34 @@ def value(self):
if is_ascii and compact and ready:
# content is stored right after the data structure in memory
ascii_type = self.target.FindFirstType("PyASCIIObject")
value = value.Cast(ascii_type)
addr = int(value.location, 16) + value.size
ascii_value = unicode_value.Cast(ascii_type)
addr = int(ascii_value.location, 16) + ascii_value.size

rv = self.process.ReadMemory(addr, length, lldb.SBError())
return rv.decode("ascii")
return self._decode_from_address(addr, length, 1, encoding="ascii")
elif compact and ready:
# content is stored right after the data structure in memory
compact_type = self.target.FindFirstType("PyCompactUnicodeObject")
value = value.Cast(compact_type)
addr = int(value.location, 16) + value.size

rv = self.process.ReadMemory(addr, length * kind, lldb.SBError())
if kind == self.U_1BYTE_KIND:
return rv.decode("latin-1")
elif kind == self.U_2BYTE_KIND:
return rv.decode("utf-16")
elif kind == self.U_4BYTE_KIND:
return rv.decode("utf-32")
else:
raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))
compact_value = unicode_value.Cast(compact_type)
addr = int(compact_value.location, 16) + compact_value.size
unit_size = self._code_unit_size(kind)

return self._decode_from_address(addr, length, unit_size)
elif ready:
data_field = unicode_value.GetChildMemberWithName("data")
data_ptr = data_field.GetChildMemberWithName("any")
addr = data_ptr.unsigned
unit_size = self._code_unit_size(kind)

return self._decode_from_address(addr, length, unit_size)
else:
# TODO: add support for legacy unicode strings
raise ValueError("Unsupported PyUnicodeObject kind: {}".format(kind))
# legacy unicode strings that only have wstr filled in
wstr = ascii_base.GetChildMemberWithName("wstr")
addr = wstr.unsigned
wchar_size = self.target.FindFirstType("wchar_t").size
if not wchar_size:
raise ValueError("Unsupported wchar_t size: {}".format(wchar_size))

return self._decode_from_address(addr, length, wchar_size)


class PyNoneObject(PyObject):
Expand Down
89 changes: 86 additions & 3 deletions tests/test_pretty_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,74 @@
from .conftest import run_lldb


def lldb_repr_from_frame(lldb_manager, value):
# Source text that legacy_unicode_setup() prepends to the generated test
# script. It defines make_legacy_string(), which allocates a str through
# PyUnicode_FromUnicode(NULL, size), copies `text` (plus a terminating NUL)
# into the object's wstr buffer via ctypes and, when ready=True, calls
# _PyUnicode_Ready() on the result.
# NOTE(review): the ctypes structures mirror the PyASCIIObject /
# PyCompactUnicodeObject layout by hand - presumably only valid on
# interpreter builds that still expose the wstr field and
# PyUnicode_FromUnicode; confirm the supported versions.
LEGACY_UNICODE_HELPERS = textwrap.dedent(
"""
import ctypes

Py_ssize_t = ctypes.c_ssize_t

class _PyASCIIObject(ctypes.Structure):
_fields_ = [
("ob_refcnt", Py_ssize_t),
("ob_type", ctypes.c_void_p),
("length", Py_ssize_t),
("hash", Py_ssize_t),
("state", ctypes.c_uint),
("wstr", ctypes.c_void_p),
]

class _PyCompactUnicodeObject(ctypes.Structure):
_fields_ = [
("_base", _PyASCIIObject),
("utf8_length", Py_ssize_t),
("utf8", ctypes.c_void_p),
("wstr_length", Py_ssize_t),
]

_PyUnicode_FromUnicode = ctypes.pythonapi.PyUnicode_FromUnicode
_PyUnicode_FromUnicode.argtypes = [ctypes.c_void_p, Py_ssize_t]
_PyUnicode_FromUnicode.restype = ctypes.py_object

_PyUnicode_Ready = ctypes.pythonapi._PyUnicode_Ready
_PyUnicode_Ready.argtypes = [ctypes.py_object]
_PyUnicode_Ready.restype = ctypes.c_int

def make_legacy_string(text, ready=False):
size = len(text)
legacy = _PyUnicode_FromUnicode(ctypes.c_void_p(), size)
header = _PyCompactUnicodeObject.from_address(id(legacy))
buffer_type = ctypes.c_wchar * (size + 1)
buffer = buffer_type.from_address(header._base.wstr)
for index, char in enumerate(text):
buffer[index] = char
buffer[size] = '\\0'
if ready:
_PyUnicode_Ready(legacy)
return legacy
"""
)


def legacy_unicode_setup(text_literal, ready):
    """Build setup code that binds ``legacy_value`` to a legacy str.

    The result is ``LEGACY_UNICODE_HELPERS`` followed by one assignment that
    calls ``make_legacy_string``. ``text_literal`` is spliced in verbatim as
    the source text of the first argument, and ``ready`` is rendered as the
    literal ``True`` or ``False``.
    """
    ready_literal = "True" if ready else "False"
    assignment = textwrap.dedent(
        "\nlegacy_value = make_legacy_string({text_literal}, ready={ready})\n"
    )
    # Only the assignment is formatted; the helper text is appended as-is.
    rendered = assignment.format(text_literal=text_literal, ready=ready_literal)
    return LEGACY_UNICODE_HELPERS + rendered


def lldb_repr_from_frame(lldb_manager, value, setup_code=""):
# Set a breakpoint in the implementation of a function that is conveniently
# called with a single argument `v`, whose representation we are trying to
# scrape from the LLDB output. When the breakpoint is hit, the argument
# value will be pretty-printed by `frame info` command.
setup_block = textwrap.dedent(setup_code) if setup_code else ""
code = f"""
from collections import *
from six.moves import *

import test_extension
{setup_block}
test_extension.identity({value})
"""
response = run_lldb(
Expand All @@ -30,9 +88,9 @@ def lldb_repr_from_frame(lldb_manager, value):
return match


def assert_lldb_repr(lldb_manager, value, expected, code_value=None):
def assert_lldb_repr(lldb_manager, value, expected, code_value=None, setup_code=""):
value_repr = code_value or repr(value)
match = lldb_repr_from_frame(lldb_manager, value_repr)
match = lldb_repr_from_frame(lldb_manager, value_repr, setup_code=setup_code)
assert match is not None

if isinstance(value, (set, frozenset, dict)):
Expand Down Expand Up @@ -101,6 +159,31 @@ def test_str(lldb):
assert_lldb_repr(lldb, "æ", "(u'\\\\xe6')|('æ')")


def test_str_legacy_not_ready(lldb):
    """Pretty-print a legacy str that has not been readied."""
    legacy_value = "\u041f\u0440\u0438\u0432\u0435\u0442"
    setup_code = legacy_unicode_setup(
        '"\\u041f\\u0440\\u0438\\u0432\\u0435\\u0442"', ready=False
    )
    assert_lldb_repr(
        lldb,
        legacy_value,
        # The u'...' alternative must match an escaped repr, so each
        # backslash is escaped once for Python and once for the regex.
        "(u'\\\\u041f\\\\u0440\\\\u0438\\\\u0432\\\\u0435\\\\u0442')|('Привет')",
        code_value="legacy_value",
        setup_code=setup_code,
    )


def test_str_legacy_ready_non_compact(lldb):
    """Pretty-print a legacy str after it has been readied."""
    legacy_value = "A\u2665B"
    setup_code = legacy_unicode_setup('"A\\u2665B"', ready=True)
    assert_lldb_repr(
        lldb,
        legacy_value,
        # The u'...' alternative must match an escaped repr, so the
        # backslash is escaped once for Python and once for the regex.
        "(u'A\\\\u2665B')|('A♥B')",
        code_value="legacy_value",
        setup_code=setup_code,
    )



def test_list(lldb):
assert_lldb_repr(lldb, [], r"\[\]")
assert_lldb_repr(lldb, [1, 2, 3], r"\[1, 2, 3\]")
Expand Down