diff --git a/rich/cells.py b/rich/cells.py
index a85462271c..977f1e16d5 100644
--- a/rich/cells.py
+++ b/rich/cells.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from functools import lru_cache
-from typing import Callable
+from typing import Callable, Iterator, Tuple
 
 from ._cell_widths import CELL_WIDTHS
 
@@ -30,6 +30,158 @@
 _is_single_cell_widths: Callable[[str], bool] = _SINGLE_CELLS.issuperset
 
 
+_ZWJ = "\u200d"
+_VS16 = "\ufe0f"
+_KEYCAP = "\u20e3"
+_CANCEL_TAG = 0xE007F
+
+# Characters that can only matter as part of a multi-codepoint emoji sequence.
+# Held in a frozenset so the check is a single set-method call (like
+# `_is_single_cell_widths` above) rather than a Python-level loop.
+_EMOJI_SEQUENCE_MARKERS = frozenset(
+    [_VS16, _ZWJ, _KEYCAP]
+    + [chr(codepoint) for codepoint in range(0x1F1E6, 0x1F1FF + 1)]  # flags
+    + [chr(codepoint) for codepoint in range(0x1F3FB, 0x1F3FF + 1)]  # skin tones
+    + [chr(codepoint) for codepoint in range(0xE0020, _CANCEL_TAG + 1)]  # tags
+)
+
+
+def _is_regional_indicator(codepoint: int) -> bool:
+    """Check if a codepoint is a Regional Indicator Symbol Letter (for flags)."""
+    return 0x1F1E6 <= codepoint <= 0x1F1FF
+
+
+def _is_emoji_modifier(codepoint: int) -> bool:
+    """Check if a codepoint is an emoji modifier (skin tone)."""
+    return 0x1F3FB <= codepoint <= 0x1F3FF
+
+
+def _is_tag_spec_char(codepoint: int) -> bool:
+    """Check if a codepoint is a tag spec character (used in some emoji sequences)."""
+    return 0xE0020 <= codepoint <= 0xE007E
+
+
+def _requires_emoji_clustering(text: str) -> bool:
+    """Return True if we should apply emoji/grapheme clustering heuristics.
+
+    For most text, summing widths per codepoint is sufficient (and faster).
+    Multi-codepoint emoji sequences require special handling to match how modern
+    terminals render them.
+    """
+    return not _EMOJI_SEQUENCE_MARKERS.isdisjoint(text)
+
+
+def _consume_emoji_component(text: str, index: int) -> Tuple[int, bool]:
+    """Consume one emoji component starting at ``text[index]``.
+
+    A component is a base character plus the zero-width characters attached to
+    it (VS16, combining marks). If the base is an emoji (2 cells wide, or
+    followed by VS16) an optional skin tone modifier and an optional tag
+    sequence are consumed too. A ZWJ is never consumed; the caller decides
+    whether it joins another component.
+
+    Args:
+        text: Text being scanned.
+        index: Index of the base character, which must be less than len(text).
+
+    Returns:
+        A tuple of (index just past the component, whether it is an emoji).
+    """
+    _cell_size = get_character_cell_size
+    length = len(text)
+    is_emoji = _cell_size(text[index]) == 2
+    index += 1
+
+    # Attach VS16 / combining marks etc. (width 0) to the base to avoid splitting.
+    while index < length:
+        character = text[index]
+        if character == _VS16:
+            is_emoji = True
+        elif character == _ZWJ or _cell_size(character) != 0:
+            break
+        index += 1
+
+    if not is_emoji:
+        return index, False
+
+    # Emoji modifier (skin tone), with any zero-width characters attached to it.
+    if index < length and _is_emoji_modifier(ord(text[index])):
+        index += 1
+        while (
+            index < length
+            and text[index] != _ZWJ
+            and _cell_size(text[index]) == 0
+        ):
+            index += 1
+
+    # Emoji tag sequences: base + tags + CANCEL TAG
+    if index < length and _is_tag_spec_char(ord(text[index])):
+        while index < length and _is_tag_spec_char(ord(text[index])):
+            index += 1
+        if index < length and ord(text[index]) == _CANCEL_TAG:
+            index += 1
+
+    return index, True
+
+
+def _iter_cell_chunks(text: str) -> Iterator[Tuple[str, int]]:
+    """Yield (substring, cell_width) pairs without splitting common emoji sequences.
+
+    This is **not** a full implementation of Unicode grapheme clustering.
+    It focuses on the sequences most likely to cause incorrect terminal cell
+    measurements if treated per codepoint, including:
+
+    - Emoji presentation sequences (VS16, U+FE0F)
+    - ZWJ sequences (U+200D)
+    - Regional indicator pairs (flags)
+    - Keycap sequences (e.g. "1️⃣")
+    - Emoji modifier sequences (skin tones)
+    - Emoji tag sequences
+
+    For these sequences we yield a single chunk with width 2.
+    """
+    length = len(text)
+    index = 0
+
+    while index < length:
+        start = index
+        character = text[index]
+
+        # Flags are composed of two regional indicator codepoints.
+        if (
+            index + 1 < length
+            and _is_regional_indicator(ord(character))
+            and _is_regional_indicator(ord(text[index + 1]))
+        ):
+            index += 2
+            yield text[start:index], 2
+            continue
+
+        # Keycap sequences: [0-9#*] [VS16]? U+20E3
+        if character in "0123456789#*":
+            index += 1
+            if index < length and text[index] == _VS16:
+                index += 1
+            if index < length and text[index] == _KEYCAP:
+                index += 1
+                yield text[start:index], 2
+                continue
+            index = start
+
+        base_width = get_character_cell_size(character)
+        index, is_emoji = _consume_emoji_component(text, index)
+
+        # ZWJ sequences join multiple emoji components into a single glyph.
+        # NOTE: whatever follows a ZWJ is fused into the chunk, emoji or not.
+        if is_emoji:
+            while index < length and text[index] == _ZWJ:
+                index += 1
+                if index < length:
+                    index, _ = _consume_emoji_component(text, index)
+
+        yield text[start:index], 2 if is_emoji else base_width
+
+
 @lru_cache(4096)
 def cached_cell_len(text: str) -> int:
     """Get the number of cells required to display text.
@@ -45,7 +197,9 @@ def cached_cell_len(text: str) -> int:
     """
     if _is_single_cell_widths(text):
         return len(text)
-    return sum(map(get_character_cell_size, text))
+    if not _requires_emoji_clustering(text):
+        return sum(map(get_character_cell_size, text))
+    return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))
 
 
 def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> int:
@@ -61,7 +215,9 @@ def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> in
         return _cell_len(text)
     if _is_single_cell_widths(text):
         return len(text)
-    return sum(map(get_character_cell_size, text))
+    if not _requires_emoji_clustering(text):
+        return sum(map(get_character_cell_size, text))
+    return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))
 
 
 @lru_cache(maxsize=4096)
@@ -104,28 +260,47 @@ def set_cell_size(text: str, total: int) -> str:
 
     if total <= 0:
         return ""
-    cell_size = cell_len(text)
-    if cell_size == total:
-        return text
-    if cell_size < total:
-        return text + " " * (total - cell_size)
 
-    start = 0
-    end = len(text)
+    # Fast-path the previous behaviour when we don't need to consider multi-codepoint
+    # emoji sequences.
+    if not _requires_emoji_clustering(text):
+        cell_size = cell_len(text)
+        if cell_size == total:
+            return text
+        if cell_size < total:
+            return text + " " * (total - cell_size)
+
+        start = 0
+        end = len(text)
+
+        # Binary search until we find the right size
+        while True:
+            pos = (start + end) // 2
+            before = text[: pos + 1]
+            before_len = cell_len(before)
+            if before_len == total + 1 and cell_len(before[-1]) == 2:
+                return before[:-1] + " "
+            if before_len == total:
+                return before
+            if before_len > total:
+                end = pos
+            else:
+                start = pos
+
+    # Cluster-aware resize, to avoid splitting multi-codepoint emoji sequences.
+    current_width = 0
+    output: list[str] = []
+
+    for chunk, chunk_width in _iter_cell_chunks(text):
+        if current_width + chunk_width > total:
+            break
+        output.append(chunk)
+        current_width += chunk_width
 
-    # Binary search until we find the right size
-    while True:
-        pos = (start + end) // 2
-        before = text[: pos + 1]
-        before_len = cell_len(before)
-        if before_len == total + 1 and cell_len(before[-1]) == 2:
-            return before[:-1] + " "
-        if before_len == total:
-            return before
-        if before_len > total:
-            end = pos
-        else:
-            start = pos
+    if current_width < total:
+        output.append(" " * (total - current_width))
+
+    return "".join(output)
 
 
 def chop_cells(
@@ -142,7 +317,7 @@ def chop_cells(
         A list of strings such that each string in the list has cell width
         less than or equal to the available width.
     """
-    _get_character_cell_size = get_character_cell_size
+    # Use chunk iteration so we don't split common multi-codepoint emoji sequences.
     lines: list[list[str]] = [[]]
 
     append_new_line = lines.append
@@ -150,16 +325,15 @@ def chop_cells(
 
     total_width = 0
 
-    for character in text:
-        cell_width = _get_character_cell_size(character)
+    for chunk, cell_width in _iter_cell_chunks(text):
         char_doesnt_fit = total_width + cell_width > width
 
         if char_doesnt_fit:
-            append_new_line([character])
+            append_new_line([chunk])
             append_to_last_line = lines[-1].append
             total_width = cell_width
         else:
-            append_to_last_line(character)
+            append_to_last_line(chunk)
             total_width += cell_width
 
     return ["".join(line) for line in lines]
@@ -167,8 +341,15 @@ def chop_cells(
 
 if __name__ == "__main__":  # pragma: no cover
     print(get_character_cell_size("😽"))
-    for line in chop_cells("""这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", 8):
+    for line in chop_cells(
+        """这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", 8
+    ):
         print(line)
     for n in range(80, 1, -1):
-        print(set_cell_size("""这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", n) + "|")
+        print(
+            set_cell_size(
+                """这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", n
+            )
+            + "|"
+        )
         print("x" * n)