Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 239 additions & 30 deletions rich/cells.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from functools import lru_cache
from typing import Callable
from typing import Callable, Iterator, Tuple

from ._cell_widths import CELL_WIDTHS

Expand Down Expand Up @@ -30,6 +30,186 @@
# Bound ``issuperset`` of _SINGLE_CELLS: returns True when every character of the
# given string is a member of that collection. Callers use a True result to
# short-circuit to ``len(text)`` (presumably _SINGLE_CELLS holds the characters
# known to occupy exactly one cell -- confirm against its definition).
_is_single_cell_widths: Callable[[str], bool] = _SINGLE_CELLS.issuperset


# Zero-width joiner (U+200D), used to join emoji components in ZWJ sequences.
_ZWJ = "\u200d"
# Variation selector 16 (U+FE0F), which marks an emoji presentation sequence.
_VS16 = "\ufe0f"
# Combining enclosing keycap (U+20E3), the final codepoint of a keycap sequence.
_KEYCAP = "\u20e3"


def _is_regional_indicator(codepoint: int) -> bool:
"""Check if a codepoint is a Regional Indicator Symbol Letter (for flags)."""
return 0x1F1E6 <= codepoint <= 0x1F1FF


def _is_emoji_modifier(codepoint: int) -> bool:
"""Check if a codepoint is an emoji modifier (skin tone)."""
return 0x1F3FB <= codepoint <= 0x1F3FF


def _is_tag_spec_char(codepoint: int) -> bool:
"""Check if a codepoint is a tag spec character (used in some emoji sequences)."""
return 0xE0020 <= codepoint <= 0xE007E


def _requires_emoji_clustering(text: str) -> bool:
    """Return True if we should apply emoji/grapheme clustering heuristics.

    For most text, summing widths per codepoint is sufficient (and faster).
    Multi-codepoint emoji sequences require special handling to match how modern
    terminals render them.

    Args:
        text: Text to inspect.

    Returns:
        True if any character is VS16, ZWJ, the keycap mark, a regional
        indicator, an emoji modifier, a tag spec character or CANCEL TAG;
        False otherwise (including for an empty string).
    """
    # Built once rather than on every loop iteration.
    sequence_markers = (_VS16, _ZWJ, _KEYCAP)
    for character in text:
        if character in sequence_markers:
            return True
        codepoint = ord(character)
        if (
            _is_regional_indicator(codepoint)
            or _is_emoji_modifier(codepoint)
            or _is_tag_spec_char(codepoint)
            or codepoint == 0xE007F  # CANCEL TAG
        ):
            return True
    return False


def _consume_emoji_component(
    text: str, index: int, length: int, cell_size: Callable[[str], int]
) -> Tuple[int, int, bool]:
    """Consume one potential emoji component whose base is ``text[index]``.

    A component is a base character, followed by any run of VS16 / other
    zero-width characters, then (only if the component counts as an emoji) an
    optional emoji modifier and an optional tag sequence.

    Args:
        text: Text being scanned.
        index: Index of the component's base character (must be < ``length``).
        length: ``len(text)``, passed in so it is not recomputed.
        cell_size: Function returning the cell width of a single character.

    Returns:
        A tuple of (index just past the component, cell width of the base
        character, whether the component is treated as an emoji).
    """
    base_width = cell_size(text[index])
    index += 1
    is_emoji = base_width == 2

    # Attach VS16 / combining marks etc. (width 0) to the base to avoid
    # splitting. ZWJ is never consumed here: the caller uses it to join
    # components together.
    while index < length:
        next_character = text[index]
        if next_character == _VS16:
            # An explicit emoji presentation selector forces emoji treatment.
            is_emoji = True
        elif next_character == _ZWJ or cell_size(next_character) != 0:
            break
        index += 1

    if not is_emoji:
        return index, base_width, False

    # Emoji modifier (skin tone), plus any zero-width characters after it.
    if index < length and _is_emoji_modifier(ord(text[index])):
        index += 1
        while (
            index < length
            and text[index] != _ZWJ
            and cell_size(text[index]) == 0
        ):
            index += 1

    # Emoji tag sequences: tag spec characters, then an optional CANCEL TAG.
    if index < length and _is_tag_spec_char(ord(text[index])):
        while index < length and _is_tag_spec_char(ord(text[index])):
            index += 1
        if index < length and ord(text[index]) == 0xE007F:  # CANCEL TAG
            index += 1

    return index, base_width, True


def _iter_cell_chunks(text: str) -> Iterator[Tuple[str, int]]:
    """Yield (substring, cell_width) pairs without splitting common emoji sequences.

    This is **not** a full implementation of Unicode grapheme clustering.
    It focuses on the sequences most likely to cause incorrect terminal cell
    measurements if treated per codepoint, including:

    - Emoji presentation sequences (VS16, U+FE0F)
    - ZWJ sequences (U+200D)
    - Regional indicator pairs (flags)
    - Keycap sequences (e.g. "1️⃣")
    - Emoji modifier sequences (skin tones)
    - Emoji tag sequences

    For these sequences we yield a single chunk with width 2.

    Args:
        text: Text to split into chunks.

    Yields:
        Tuples of (chunk, cell width). Concatenating the chunks in order
        reproduces ``text``. Chunks that are not recognised as an emoji
        sequence carry the cell width of their first character.
    """
    _cell_size = get_character_cell_size
    length = len(text)
    index = 0

    while index < length:
        start = index
        character = text[index]

        # Flags are composed of two regional indicator codepoints.
        if (
            _is_regional_indicator(ord(character))
            and index + 1 < length
            and _is_regional_indicator(ord(text[index + 1]))
        ):
            index += 2
            yield text[start:index], 2
            continue

        # Keycap sequences: [0-9#*] [VS16]? U+20E3
        if character in "0123456789#*":
            # Look ahead without committing, so a plain digit/#/* falls
            # through to the generic handling below.
            keycap_end = index + 1
            if keycap_end < length and text[keycap_end] == _VS16:
                keycap_end += 1
            if keycap_end < length and text[keycap_end] == _KEYCAP:
                index = keycap_end + 1
                yield text[start:index], 2
                continue

        # Parse a potential emoji component: base, optional variation selectors /
        # combining marks (width 0), optional modifier, optional tag sequence.
        index, base_width, is_emoji = _consume_emoji_component(
            text, index, length, _cell_size
        )

        # ZWJ sequences join multiple emoji components into a single glyph.
        # Whatever follows a ZWJ is absorbed into the chunk, emoji or not.
        if is_emoji:
            while index < length and text[index] == _ZWJ:
                index += 1
                if index >= length:
                    # Trailing ZWJ with nothing to join: keep it in this chunk.
                    break
                index, _, _ = _consume_emoji_component(
                    text, index, length, _cell_size
                )

        yield text[start:index], 2 if is_emoji else base_width


@lru_cache(4096)
def cached_cell_len(text: str) -> int:
"""Get the number of cells required to display text.
Expand All @@ -45,7 +225,9 @@ def cached_cell_len(text: str) -> int:
"""
if _is_single_cell_widths(text):
return len(text)
return sum(map(get_character_cell_size, text))
if not _requires_emoji_clustering(text):
return sum(map(get_character_cell_size, text))
return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))


def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> int:
Expand All @@ -61,7 +243,9 @@ def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> in
return _cell_len(text)
if _is_single_cell_widths(text):
return len(text)
return sum(map(get_character_cell_size, text))
if not _requires_emoji_clustering(text):
return sum(map(get_character_cell_size, text))
return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))


@lru_cache(maxsize=4096)
Expand Down Expand Up @@ -104,28 +288,47 @@ def set_cell_size(text: str, total: int) -> str:

if total <= 0:
return ""
cell_size = cell_len(text)
if cell_size == total:
return text
if cell_size < total:
return text + " " * (total - cell_size)

start = 0
end = len(text)
# Fast-path the previous behaviour when we don't need to consider multi-codepoint
# emoji sequences.
if not _requires_emoji_clustering(text):
cell_size = cell_len(text)
if cell_size == total:
return text
if cell_size < total:
return text + " " * (total - cell_size)

start = 0
end = len(text)

# Binary search until we find the right size
while True:
pos = (start + end) // 2
before = text[: pos + 1]
before_len = cell_len(before)
if before_len == total + 1 and cell_len(before[-1]) == 2:
return before[:-1] + " "
if before_len == total:
return before
if before_len > total:
end = pos
else:
start = pos

# Cluster-aware resize, to avoid splitting multi-codepoint emoji sequences.
current_width = 0
output: list[str] = []

for chunk, chunk_width in _iter_cell_chunks(text):
if current_width + chunk_width > total:
break
output.append(chunk)
current_width += chunk_width

# Binary search until we find the right size
while True:
pos = (start + end) // 2
before = text[: pos + 1]
before_len = cell_len(before)
if before_len == total + 1 and cell_len(before[-1]) == 2:
return before[:-1] + " "
if before_len == total:
return before
if before_len > total:
end = pos
else:
start = pos
if current_width < total:
output.append(" " * (total - current_width))

return "".join(output)


def chop_cells(
Expand All @@ -142,33 +345,39 @@ def chop_cells(
A list of strings such that each string in the list has cell width
less than or equal to the available width.
"""
_get_character_cell_size = get_character_cell_size
# Use chunk iteration so we don't split common multi-codepoint emoji sequences.
lines: list[list[str]] = [[]]

append_new_line = lines.append
append_to_last_line = lines[-1].append

total_width = 0

for character in text:
cell_width = _get_character_cell_size(character)
for chunk, cell_width in _iter_cell_chunks(text):
char_doesnt_fit = total_width + cell_width > width

if char_doesnt_fit:
append_new_line([character])
append_new_line([chunk])
append_to_last_line = lines[-1].append
total_width = cell_width
else:
append_to_last_line(character)
append_to_last_line(chunk)
total_width += cell_width

return ["".join(line) for line in lines]


if __name__ == "__main__":  # pragma: no cover
    # Manual smoke test: print the cell width of one emoji, wrap a CJK
    # sentence to 8 cells per line, then resize the same sentence to every
    # width from 80 down to 2 with a ruler of "x" characters underneath.
    print(get_character_cell_size("😽"))
    for line in chop_cells(
        """这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", 8
    ):
        print(line)
    for n in range(80, 1, -1):
        print(
            set_cell_size(
                """这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", n
            )
            + "|"
        )
        print("x" * n)