rigelbm · rigelbm · Feb 11, 2026
diff --git a/rich/cells.py b/rich/cells.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from functools import lru_cache
-from typing import Callable
+from typing import Callable, Iterator, Tuple
 
 from ._cell_widths import CELL_WIDTHS
 
@@ -30,6 +30,186 @@
 _is_single_cell_widths: Callable[[str], bool] = _SINGLE_CELLS.issuperset
 
 
+_ZWJ = "\N{ZERO WIDTH JOINER}"  # U+200D
+_VS16 = "\N{VARIATION SELECTOR-16}"  # U+FE0F
+_KEYCAP = "\N{COMBINING ENCLOSING KEYCAP}"  # U+20E3
+
+
+def _is_regional_indicator(codepoint: int) -> bool:
+    """Return True if codepoint is a Regional Indicator Symbol Letter (half a flag)."""
+    return not (codepoint < 0x1F1E6 or codepoint > 0x1F1FF)
+
+
+def _is_emoji_modifier(codepoint: int) -> bool:
+    """Return True if codepoint is one of the emoji skin tone modifiers."""
+    return 0x1F3FF >= codepoint >= 0x1F3FB
+
+
+def _is_tag_spec_char(codepoint: int) -> bool:
+    """Return True if codepoint is a tag spec character (U+E0020 to U+E007E)."""
+    return codepoint >= 0xE0020 and codepoint <= 0xE007E
+
+
+def _requires_emoji_clustering(text: str) -> bool:
+    """Decide whether ``text`` needs cluster-aware (chunked) width measurement.
+
+    Summing the width of each codepoint is sufficient, and faster, for most text.
+    Chunking is only needed when the text contains a codepoint that takes part in
+    multi-codepoint emoji sequences: VS16, ZWJ, the combining keycap, a regional
+    indicator, a skin tone modifier, a tag spec character or CANCEL TAG. The scan
+    stops at the first such codepoint.
+    """
+    joiners = (_VS16, _ZWJ, _KEYCAP)
+    return any(
+        character in joiners
+        or _is_regional_indicator(codepoint)
+        or _is_emoji_modifier(codepoint)
+        or _is_tag_spec_char(codepoint)
+        or codepoint == 0xE007F  # CANCEL TAG
+        for character, codepoint in zip(text, map(ord, text))
+    )
+
+
+def _consume_emoji_component(text: str, index: int) -> Tuple[int, int]:
+    """Consume one emoji component and report where it ends and how wide it is.
+
+    A component consists of, in order:
+
+    - a base codepoint,
+    - any VS16 / zero-width codepoints attached to it (but never a ZWJ),
+    - if the component is an emoji, an optional skin tone modifier followed by
+      further zero-width codepoints,
+    - if the component is an emoji, an optional tag sequence (tag spec characters
+      optionally terminated by CANCEL TAG).
+
+    The component counts as an emoji when its base is two cells wide or a VS16 was
+    attached to it. A ZWJ is left for the caller, which decides whether it joins a
+    further component.
+
+    Args:
+        text (str): Text being scanned.
+        index (int): Index of the base codepoint; must be less than ``len(text)``.
+
+    Returns:
+        Tuple[int, int]: Index of the first codepoint after the component, and the
+            cell width of the component: 2 for an emoji, otherwise the cell width
+            of the base codepoint alone.
+    """
+    _cell_size = get_character_cell_size
+    length = len(text)
+    base_width = _cell_size(text[index])
+    index += 1
+    saw_vs16 = False
+
+    # Attach VS16 / combining marks etc. (width 0) to the base to avoid splitting.
+    while index < length:
+        character = text[index]
+        if character == _VS16:
+            saw_vs16 = True
+        elif character == _ZWJ or _cell_size(character) != 0:
+            break
+        index += 1
+
+    if base_width != 2 and not saw_vs16:
+        # Not an emoji: a following modifier or tag is not part of this component.
+        return index, base_width
+
+    # Emoji modifier (skin tone), plus anything zero-width attached to it.
+    if index < length and _is_emoji_modifier(ord(text[index])):
+        index += 1
+        while (
+            index < length
+            and text[index] != _ZWJ
+            and _cell_size(text[index]) == 0
+        ):
+            index += 1
+
+    # Emoji tag sequences: base + tags + CANCEL TAG
+    if index < length and _is_tag_spec_char(ord(text[index])):
+        while index < length and _is_tag_spec_char(ord(text[index])):
+            index += 1
+        if index < length and ord(text[index]) == 0xE007F:  # CANCEL TAG
+            index += 1
+
+    return index, 2
+
+
+def _iter_cell_chunks(text: str) -> Iterator[Tuple[str, int]]:
+    """Yield (substring, cell_width) pairs without splitting common emoji sequences.
+
+    This is **not** a full implementation of Unicode grapheme clustering.
+    It focuses on the sequences most likely to cause incorrect terminal cell
+    measurements if treated per codepoint, including:
+
+    - Emoji presentation sequences (VS16, U+FE0F)
+    - ZWJ sequences (U+200D)
+    - Regional indicator pairs (flags)
+    - Keycap sequences (e.g. "1️⃣")
+    - Emoji modifier sequences (skin tones)
+    - Emoji tag sequences
+
+    For these sequences we yield a single chunk with width 2. Any other codepoint
+    is yielded with its own cell width, together with the zero-width codepoints
+    (other than ZWJ) that directly follow it.
+
+    A chunk started by an emoji also absorbs every ZWJ that follows it along with
+    the component after each ZWJ, so a whole ZWJ sequence is one width 2 chunk.
+
+    Args:
+        text (str): Text to split into chunks.
+
+    Yields:
+        Tuple[str, int]: Consecutive, non-overlapping substrings of ``text`` and the
+            number of cells each occupies. Joining the substrings in order gives
+            back ``text``.
+    """
+    length = len(text)
+    index = 0
+
+    while index < length:
+        start = index
+        character = text[index]
+
+        # Flags are composed of two regional indicator codepoints. An unpaired
+        # regional indicator falls through and is measured like any other base.
+        if (
+            _is_regional_indicator(ord(character))
+            and index + 1 < length
+            and _is_regional_indicator(ord(text[index + 1]))
+        ):
+            index += 2
+            yield text[start:index], 2
+            continue
+
+        # Keycap sequences: [0-9#*] [VS16]? U+20E3
+        # Scan ahead with a separate cursor so that, when the U+20E3 is missing,
+        # ``index`` is untouched and the character is handled as a plain base.
+        if character in "0123456789#*":
+            keycap_end = index + 1
+            if keycap_end < length and text[keycap_end] == _VS16:
+                keycap_end += 1
+            if keycap_end < length and text[keycap_end] == _KEYCAP:
+                index = keycap_end + 1
+                yield text[start:index], 2
+                continue
+
+        # Everything else: a base codepoint plus whatever attaches to it (VS16,
+        # zero-width marks and, for emoji, a skin tone modifier / tag sequence).
+        index, width = _consume_emoji_component(text, index)
+
+        # ZWJ sequences join multiple emoji components into a single glyph. Only an
+        # emoji (width 2) can start one. Each ZWJ pulls in the component after it,
+        # whatever that is, without changing the width; a ZWJ at the very end of
+        # the text simply stays attached to the chunk.
+        if width == 2:
+            while index < length and text[index] == _ZWJ:
+                index += 1
+                if index < length:
+                    index, _ = _consume_emoji_component(text, index)
+
+        yield text[start:index], width
+
+
 @lru_cache(4096)
 def cached_cell_len(text: str) -> int:
     """Get the number of cells required to display text.
@@ -45,7 +225,9 @@ def cached_cell_len(text: str) -> int:
     """
     if _is_single_cell_widths(text):
         return len(text)
-    return sum(map(get_character_cell_size, text))
+    if not _requires_emoji_clustering(text):
+        return sum(map(get_character_cell_size, text))
+    return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))
 
 
 def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> int:
@@ -61,7 +243,9 @@ def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> in
         return _cell_len(text)
     if _is_single_cell_widths(text):
         return len(text)
-    return sum(map(get_character_cell_size, text))
+    if not _requires_emoji_clustering(text):
+        return sum(map(get_character_cell_size, text))
+    return sum(cell_size for _chunk, cell_size in _iter_cell_chunks(text))
 
 
 @lru_cache(maxsize=4096)
@@ -104,28 +288,47 @@ def set_cell_size(text: str, total: int) -> str:
 
     if total <= 0:
         return ""
-    cell_size = cell_len(text)
-    if cell_size == total:
-        return text
-    if cell_size < total:
-        return text + " " * (total - cell_size)
 
-    start = 0
-    end = len(text)
+    # Fast-path the previous behaviour when we don't need to consider multi-codepoint
+    # emoji sequences.
+    if not _requires_emoji_clustering(text):
+        cell_size = cell_len(text)
+        if cell_size == total:
+            return text
+        if cell_size < total:
+            return text + " " * (total - cell_size)
+
+        start = 0
+        end = len(text)
+
+        # Binary search until we find the right size
+        while True:
+            pos = (start + end) // 2
+            before = text[: pos + 1]
+            before_len = cell_len(before)
+            if before_len == total + 1 and cell_len(before[-1]) == 2:
+                return before[:-1] + " "
+            if before_len == total:
+                return before
+            if before_len > total:
+                end = pos
+            else:
+                start = pos
+
+    # Cluster-aware resize, to avoid splitting multi-codepoint emoji sequences.
+    current_width = 0
+    output: list[str] = []
+
+    for chunk, chunk_width in _iter_cell_chunks(text):
+        if current_width + chunk_width > total:
+            break
+        output.append(chunk)
+        current_width += chunk_width
 
-    # Binary search until we find the right size
-    while True:
-        pos = (start + end) // 2
-        before = text[: pos + 1]
-        before_len = cell_len(before)
-        if before_len == total + 1 and cell_len(before[-1]) == 2:
-            return before[:-1] + " "
-        if before_len == total:
-            return before
-        if before_len > total:
-            end = pos
-        else:
-            start = pos
+    if current_width < total:
+        output.append(" " * (total - current_width))
+
+    return "".join(output)
 
 
 def chop_cells(
@@ -142,33 +345,39 @@ def chop_cells(
         A list of strings such that each string in the list has cell width
         less than or equal to the available width.
     """
-    _get_character_cell_size = get_character_cell_size
+    # Use chunk iteration so we don't split common multi-codepoint emoji sequences.
     lines: list[list[str]] = [[]]
 
     append_new_line = lines.append
     append_to_last_line = lines[-1].append
 
     total_width = 0
 
-    for character in text:
-        cell_width = _get_character_cell_size(character)
+    for chunk, cell_width in _iter_cell_chunks(text):
         char_doesnt_fit = total_width + cell_width > width
 
         if char_doesnt_fit:
-            append_new_line([character])
+            append_new_line([chunk])
             append_to_last_line = lines[-1].append
             total_width = cell_width
         else:
-            append_to_last_line(character)
+            append_to_last_line(chunk)
             total_width += cell_width
 
     return ["".join(line) for line in lines]
 
 
 if __name__ == "__main__":  # pragma: no cover
     print(get_character_cell_size("😽"))
-    for line in chop_cells("""这是对亚洲语言支持的测试。面对模棱两可的想法，拒绝猜测的诱惑。""", 8):
+    # CJK sample text shared by the wrapping and the resizing demo below.
+    sample = """这是对亚洲语言支持的测试。面对模棱两可的想法，拒绝猜测的诱惑。"""
+    # Wrap the sample using an available width of 8 cells.
+    for line in chop_cells(sample, 8):
         print(line)
     for n in range(80, 1, -1):
-        print(set_cell_size("""这是对亚洲语言支持的测试。面对模棱两可的想法，拒绝猜测的诱惑。""", n) + "|")
+        # Resize the sample to n cells. The trailing "|" marks where the resized
+        # text ends, and the row of n "x" characters printed next acts as a ruler
+        # to compare it against.
+        resized = set_cell_size(sample, n)
+        print(resized + "|")
         print("x" * n)