From 45fbaec537b22f9ded2a50d3ec7a05ed80ff52ae Mon Sep 17 00:00:00 2001 From: klei22 Date: Tue, 23 Dec 2025 18:49:53 -0800 Subject: [PATCH 01/10] Update script to emit stats and expand targets Each script now has a common argument for stats_json, which emits the number of tokens not transcribed (those which will be held for byte tokenization). Added an espeak2ipa.py script which can target any of the espeak languages, and defaulting this to target shan for now. --- data/flores200-res/phoneticize.sh | 10 +- data/template/utils/en2ipa.py | 188 +++++++++++-- data/template/utils/espeak2ipa.py | 406 ++++++++++++++++++++++++++++ data/template/utils/ja2ipa.py | 172 ++++++------ data/template/utils/ko_en_to_ipa.py | 138 ++++++++-- data/template/utils/zh_to_ipa.py | 168 ++++++++---- 6 files changed, 886 insertions(+), 196 deletions(-) create mode 100644 data/template/utils/espeak2ipa.py diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 007c034696..b52f0c2789 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -1,7 +1,9 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper +python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json +python3 utils/espeak2ipa.py text_shn_Mymr.txt --mode text --output_file ipa_text_shan.txt --no-wrapper --stats_json shan_stats.json --lang shan + diff --git a/data/template/utils/en2ipa.py b/data/template/utils/en2ipa.py index c6b9c84973..14708f1cc7 100644 --- a/data/template/utils/en2ipa.py +++ b/data/template/utils/en2ipa.py @@ -1,11 +1,11 @@ +#!/usr/bin/env python3 # data/template/utils/en2ipa.py import subprocess -from konlpy.tag import Okt import argparse import re import json -from typing import List, Tuple +from typing import List, Tuple, Optional, Dict, Any from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn from concurrent.futures import ThreadPoolExecutor, as_completed import os @@ -14,6 +14,21 @@ counter = 0 counter_lock = threading.Lock() +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + +_WORD_RE = re.compile(r'\w+|[^\w\s]', re.UNICODE) + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_english_token(tok: str) -> bool: + # Matches your original intent: “contains any a-z letter” + return any('a' <= ch.lower() <= 'z' for ch in tok) + + def transcribe_english(sentence, wrapper=False): """Transcribe an English sentence into its phonemes using espeak.""" try: @@ -24,28 +39,28 @@ def transcribe_english(sentence, wrapper=False): ) transcription = result.stdout.strip().replace("ㆍ", " ") if "(en)" in transcription: - return f"[[[[[{sentence}]]]]]" if wrapper else sentence + return 
f"{WRAP_PREFIX}{sentence}{WRAP_SUFFIX}" if wrapper else sentence return transcription except Exception as e: return f"Error in transcribing English: {str(e)}" + def handle_mixed_language(word, wrapper=False): """Handle a word with potential English, Language, or number content.""" global counter if word.isdigit(): return word - elif any('a' <= char.lower() <= 'z' for char in word): + elif is_english_token(word): return transcribe_english(word, wrapper=wrapper) else: if wrapper: - return "[[[[[" + word + "]]]]]" + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" else: - # thread-safe increment + # thread-safe increment (your existing stat) with counter_lock: counter += 1 return word -_WORD_RE = re.compile(r'\w+|[^\w\s]', re.UNICODE) def transcribe_tokens_to_string(tokens: List[str], wrapper: bool) -> str: result = [] @@ -56,11 +71,33 @@ def transcribe_tokens_to_string(tokens: List[str], wrapper: bool) -> str: result.append(tok) return " ".join(result) -def _worker_sentence(sentence: str, wrapper: bool) -> str: - """Worker function: tokenize and transcribe one sentence/line.""" + +def _worker_sentence(sentence: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: + """ + Worker function: tokenize and transcribe one sentence/line. + If stats is provided, updates: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (English tokens) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (digits, punctuation, non-English words) + Counts are based on ORIGINAL tokens, so wrapper overhead is excluded automatically. + """ tokens = _WORD_RE.findall(sentence) + + if stats is not None: + for tok in tokens: + b = utf8_len(tok) + if re.match(r'\w+', tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif is_english_token(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + return transcribe_tokens_to_string(tokens, wrapper=wrapper) + def _progress() -> Progress: return Progress( TextColumn("[bold blue]{task.description}"), @@ -72,8 +109,16 @@ def _progress() -> Progress: transient=False, ) -def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False, - multithread: bool = False, workers: int = 0): + +def transcribe_multilingual( + sentences, + input_json_key=None, + output_json_key='ipa', + wrapper=False, + multithread: bool = False, + workers: int = 0, + stats: Optional[Dict[str, int]] = None, +): """Transcribe multilingual sentences (JSON list mode).""" try: data = json.loads(sentences) if isinstance(sentences, str) else sentences @@ -84,6 +129,12 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa if n == 0: return json.dumps(data, ensure_ascii=False, indent=4) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + if not multithread or workers <= 1: # Single-threaded path (original behavior) with _progress() as progress: @@ -91,24 +142,35 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa for item in data: if input_json_key in item: sentence = item[input_json_key] - item[output_json_key] = _worker_sentence(sentence, wrapper) + item[output_json_key] = _worker_sentence(sentence, wrapper, stats=stats) progress.update(task, advance=1) else: # Multithreaded path with ordered assembly results: List[Tuple[int, str]] = [None] * n # type: 
ignore + # prepare jobs jobs = [] for idx, item in enumerate(data): sentence = item.get(input_json_key, "") jobs.append((idx, sentence)) + # Per-thread stats to avoid locks in hot path; merge at end + per_thread_stats: List[Dict[str, int]] = [] + + def submit_job(ex, idx_sentence): + idx, sentence = idx_sentence + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_thread_stats.append(local_stats) + return ex.submit(_worker_sentence, sentence, wrapper, local_stats), idx + with _progress() as progress: task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = { - ex.submit(_worker_sentence, sentence, wrapper): idx - for idx, sentence in jobs - } + future_to_idx = {} + for idx_sentence in jobs: + fut, idx = submit_job(ex, idx_sentence) + future_to_idx[fut] = idx + for fut in as_completed(future_to_idx): idx = future_to_idx[fut] try: @@ -118,6 +180,11 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa results[idx] = (idx, res) progress.update(task, advance=1) + # merge per-thread stats + for st in per_thread_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + # write back in original order for idx, item in enumerate(data): if input_json_key in item: @@ -129,32 +196,50 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa return json.dumps(data, ensure_ascii=False, indent=4) -def transcribe_text_lines(lines: List[str], wrapper: bool, multithread: bool = False, workers: int = 0) -> List[str]: + +def transcribe_text_lines( + lines: List[str], + wrapper: bool, + multithread: bool = False, + workers: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> List[str]: """Transcribe a plain-text file line-by-line.""" n = len(lines) if n == 0: return [] + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + if not multithread or workers <= 1: - # Single-threaded out_lines: List[str] = [] with _progress() as progress: task = progress.add_task("Processing text lines", total=n) for line in lines: raw = line.rstrip("\n") - out_lines.append(_worker_sentence(raw, wrapper)) + out_lines.append(_worker_sentence(raw, wrapper, stats=stats)) progress.update(task, advance=1) return out_lines else: - # Multithreaded with ordered assembly out_lines: List[str] = [None] * n # type: ignore + + # Per-thread stats (avoid global lock) + per_thread_stats: List[Dict[str, int]] = [None] * n # type: ignore + with _progress() as progress: task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = { - ex.submit(_worker_sentence, lines[i].rstrip("\n"), wrapper): i - for i in range(n) - } + future_to_idx = {} + for i in range(n): + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_thread_stats[i] = local_stats + fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), wrapper, local_stats) + future_to_idx[fut] = i + for fut in as_completed(future_to_idx): idx = future_to_idx[fut] try: @@ -162,8 +247,45 @@ def transcribe_text_lines(lines: List[str], wrapper: bool, multithread: bool = F except Exception as e: out_lines[idx] = f"Error: {e}" progress.update(task, advance=1) + + # merge stats + for st in per_thread_stats: + 
stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + return out_lines + +def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats: Dict[str, Any] = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + def main(): parser = argparse.ArgumentParser( description='Transcribe multilingual content into IPA phonemes. Supports JSON list mode and plain-text line mode.' @@ -192,12 +314,18 @@ def main(): parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, help="Number of worker threads when --multithread is enabled (default: CPU count).") + # NEW: stats output + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + args = parser.parse_args() # clamp workers if args.workers is None or args.workers < 1: args.workers = 1 + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + try: if args.mode == 'json': if not args.input_json_key: @@ -210,7 +338,8 @@ def main(): args.output_json_key, wrapper=args.wrapper, multithread=args.multithread, - workers=args.workers + workers=args.workers, + stats=stats, ) if updated_json_data: with open(args.input_file, 'w', encoding='utf-8') as f: @@ -223,19 +352,24 @@ def main(): lines, wrapper=args.wrapper, multithread=args.multithread, - workers=args.workers + workers=args.workers, + stats=stats, ) target_path = args.output_file if args.output_file else args.input_file with open(target_path, 'w', encoding='utf-8') as f: f.write("\n".join(out_lines) + ("\n" if out_lines else "")) print(f"✅ Successfully wrote transcribed text to '{target_path}'") - print(f"📊 Stats: {counter} unparseable words") + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + + print(f"📊 Stats: {counter} unparseable words (only counted when --no-wrapper)") + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except ValueError as ve: print(f"Error: {ve}") + if __name__ == '__main__': main() diff --git a/data/template/utils/espeak2ipa.py b/data/template/utils/espeak2ipa.py new file mode 100644 index 0000000000..9c2864afc0 --- /dev/null +++ b/data/template/utils/espeak2ipa.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +# espeak2ipa.py +# +# Generic IPA transcription using espeak-ng for ANY supported voice. +# Defaults to "shan" (you can override with --lang). 
+# +# Features (modeled after your en2ipa.py): +# - JSON list mode (--mode json): in-place update of a JSON list file +# - Text mode (--mode text): line-by-line transcription to output file (or overwrite input) +# - Optional wrapping for untranscribed/unparseable tokens: [[[[[...]]]]] +# - Multithreading with ordered output +# - Rich progress bar +# - Byte coverage stats (based on ORIGINAL tokens; wrapper overhead excluded) +# +# Notes: +# - "transcribed_bytes" counts bytes of ORIGINAL tokens we ATTEMPT to send to espeak +# (tokens that contain at least one Unicode letter). Digits/punct count as not_transcribed. +# - espeak-ng voices vary; if a voice is unavailable, you'll get an error/empty output. + +import subprocess +import argparse +import re +import json +from typing import List, Tuple, Optional, Dict, Any +from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn +from concurrent.futures import ThreadPoolExecutor, as_completed +import os +import threading + + +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" +_WORD_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE) + +counter_unparseable = 0 +counter_lock = threading.Lock() + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def token_has_letter(tok: str) -> bool: + # "letter" across scripts (Latin, Han, Kana, Arabic, etc.) + return any(ch.isalpha() for ch in tok) + + +def transcribe_espeak(token: str, lang: str, wrapper: bool = False) -> str: + """ + Transcribe a token via espeak-ng. + If transcription fails (empty output / exception), return wrapped or original token. + """ + global counter_unparseable + try: + result = subprocess.run( + ["espeak-ng", "-q", "-v", lang, "--ipa", token], + capture_output=True, + text=True + ) + out = (result.stdout or "").strip().replace("ㆍ", " ") + if not out: + if wrapper: + return f"{WRAP_PREFIX}{token}{WRAP_SUFFIX}" + with counter_lock: + counter_unparseable += 1 + return token + return out + except Exception: + if wrapper: + return f"{WRAP_PREFIX}{token}{WRAP_SUFFIX}" + with counter_lock: + counter_unparseable += 1 + return token + + +def handle_token(tok: str, lang: str, wrapper: bool) -> str: + """ + Decide whether to transcribe: + - digits -> passthrough + - tokens with any letter -> transcribe via espeak + - otherwise (punct/symbol) -> passthrough + """ + if tok.isdigit(): + return tok + if token_has_letter(tok): + return transcribe_espeak(tok, lang=lang, wrapper=wrapper) + return tok + + +def tokens_to_ipa_string(tokens: List[str], lang: str, wrapper: bool) -> str: + out: List[str] = [] + for tok in tokens: + if re.match(r"\w+", tok): + out.append(handle_token(tok, lang=lang, wrapper=wrapper)) + else: + out.append(tok) + return " ".join(out) + + +def _worker_sentence(sentence: str, lang: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: + """ + Tokenize & transcribe one sentence/line. 
+ If stats is provided, updates byte counts based on ORIGINAL tokens: + - transcribed_bytes: tokens containing at least one letter + - not_transcribed_bytes: digits + punctuation/symbols + other \w tokens with no letters + """ + tokens = _WORD_RE.findall(sentence) + + if stats is not None: + for tok in tokens: + b = utf8_len(tok) + if re.match(r"\w+", tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif token_has_letter(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + + return tokens_to_ipa_string(tokens, lang=lang, wrapper=wrapper) + + +def _progress() -> Progress: + return Progress( + TextColumn("[bold blue]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + TimeRemainingColumn(), + transient=False, + ) + + +def transcribe_json_list( + json_text_or_obj, + input_json_key: str, + output_json_key: str, + lang: str, + wrapper: bool, + multithread: bool, + workers: int, + stats: Optional[Dict[str, int]] = None, +) -> Optional[str]: + """ + JSON list mode: reads a JSON list of objects, writes output_json_key for each object. + Returns JSON string (pretty printed). + """ + try: + data = json.loads(json_text_or_obj) if isinstance(json_text_or_obj, str) else json_text_or_obj + if not isinstance(data, list): + raise ValueError("JSON data should be a list of objects.") + except Exception as e: + print(f"Error: {e}") + return None + + n = len(data) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + if n == 0: + return json.dumps(data, ensure_ascii=False, indent=4) + + if not multithread or workers <= 1: + with _progress() as progress: + task = progress.add_task("Processing JSON items", total=n) + for item in data: + if input_json_key in item: + sentence = item[input_json_key] + item[output_json_key] = _worker_sentence(sentence, lang=lang, wrapper=wrapper, stats=stats) + progress.update(task, advance=1) + else: + # ordered results + results: List[Tuple[int, str]] = [None] * n # type: ignore + per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore + + jobs = [(i, data[i].get(input_json_key, "")) for i in range(n)] + + with _progress() as progress: + task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) + with ThreadPoolExecutor(max_workers=workers) as ex: + future_to_idx = {} + for idx, sentence in jobs: + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_item_stats[idx] = local_stats + fut = ex.submit(_worker_sentence, sentence, lang, wrapper, local_stats) + future_to_idx[fut] = idx + + for fut in as_completed(future_to_idx): + idx = future_to_idx[fut] + try: + res = fut.result() + except Exception as e: + res = f"Error: {e}" + results[idx] = (idx, res) + progress.update(task, advance=1) + + # merge stats + for st in per_item_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + + # write back in original order + for idx, item in enumerate(data): + if input_json_key in item: + item[output_json_key] = results[idx][1] + + return json.dumps(data, ensure_ascii=False, indent=4) + + +def transcribe_text_lines( + lines: List[str], + lang: str, + wrapper: bool, + multithread: bool, + workers: int, + stats: Optional[Dict[str, int]] = None, 
+) -> List[str]: + """ + Text mode: input is one sentence per line. Output is one IPA line per input line. + """ + n = len(lines) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + if n == 0: + return [] + + if not multithread or workers <= 1: + out_lines: List[str] = [] + with _progress() as progress: + task = progress.add_task("Processing text lines", total=n) + for line in lines: + raw = line.rstrip("\n") + out_lines.append(_worker_sentence(raw, lang=lang, wrapper=wrapper, stats=stats)) + progress.update(task, advance=1) + return out_lines + + out_lines: List[str] = [None] * n # type: ignore + per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore + + with _progress() as progress: + task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) + with ThreadPoolExecutor(max_workers=workers) as ex: + future_to_idx = {} + for i in range(n): + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_item_stats[i] = local_stats + fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), lang, wrapper, local_stats) + future_to_idx[fut] = i + + for fut in as_completed(future_to_idx): + i = future_to_idx[fut] + try: + out_lines[i] = fut.result() + except Exception as e: + out_lines[i] = f"Error: {e}" + progress.update(task, advance=1) + + for st in per_item_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + + return out_lines + + +def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats: Dict[str, Any] = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + "unparseable_tokens": counter_unparseable, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + print(f"Unparseable tokens : {out_stats['unparseable_tokens']}") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + +def main(): + parser = argparse.ArgumentParser( + description="Generic IPA transcription using espeak-ng for any supported voice (default: shan). " + "Supports JSON list mode and plain-text line mode, with byte coverage stats." + ) + parser.add_argument("input_file", type=str, help="Path to the input file (JSON list or plain text).") + + # Language / voice + parser.add_argument("--lang", default="shan", + help="espeak-ng voice/language code (default: shan). Example: en, fr, de, es, ja, zh, etc.") + + # Mode selection + parser.add_argument("--mode", choices=["json", "text"], default="json", + help='Processing mode. 
"json" expects a JSON list; "text" treats file as plain text.') + + # JSON mode params + parser.add_argument("--input_json_key", type=str, + help="JSON key to read sentences from (required for --mode json).") + parser.add_argument("--output_json_key", type=str, default="ipa", + help='JSON key to store IPA (default: "ipa").') + + # Text mode params + parser.add_argument("--output_file", type=str, default=None, + help="Output file path for text mode. Defaults to overwriting input.") + + # Wrapper option + parser.add_argument("--wrapper", default=False, action=argparse.BooleanOptionalAction, + help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).") + + # Multithreading options + parser.add_argument("--multithread", default=False, action=argparse.BooleanOptionalAction, + help="Enable multithreading while preserving output order.") + parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, + help="Number of worker threads when --multithread is enabled (default: CPU count).") + + # Stats output + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + + args = parser.parse_args() + + # clamp workers + if args.workers is None or args.workers < 1: + args.workers = 1 + + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + + try: + if args.mode == "json": + if not args.input_json_key: + raise ValueError("--input_json_key is required when --mode json") + + with open(args.input_file, "r", encoding="utf-8") as f: + input_content = f.read() + + updated_json = transcribe_json_list( + input_content, + input_json_key=args.input_json_key, + output_json_key=args.output_json_key, + lang=args.lang, + wrapper=args.wrapper, + multithread=args.multithread, + workers=args.workers, + stats=stats, + ) + if updated_json is not None: + # matches your existing style: overwrite JSON input file + with open(args.input_file, "w", encoding="utf-8") as f: + f.write(updated_json) + print(f"✅ Successfully updated JSON data in '{args.input_file}'") + + else: + with open(args.input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + + out_lines = transcribe_text_lines( + lines, + lang=args.lang, + wrapper=args.wrapper, + multithread=args.multithread, + workers=args.workers, + stats=stats, + ) + + target_path = args.output_file if args.output_file else args.input_file + with open(target_path, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + print(f"✅ Successfully wrote transcribed text to '{target_path}'") + + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + + except FileNotFoundError: + print(f"Error: Input file '{args.input_file}' not found.") + except ValueError as ve: + print(f"Error: {ve}") + + +if __name__ == "__main__": + main() + diff --git a/data/template/utils/ja2ipa.py b/data/template/utils/ja2ipa.py index 513868c490..fb9ea522fd 100644 --- a/data/template/utils/ja2ipa.py +++ b/data/template/utils/ja2ipa.py @@ -367,14 +367,6 @@ def hiragana_to_ipa(text: str) -> str: # ========== 2) MeCab Morphological Tokenization ========== def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """ - Use MeCab for morphological analysis. Return four strings: - 1) spaced_original: original surface forms joined by spaces. - 2) spaced_hira_subbed: token text with the "は" particle overridden to "わ" where applicable, then converted to Hiragana. 
- 3) spaced_hira_original: the Hiragana conversion of the original spaced text. - 4) pos_tags: part-of-speech tags for each token (joined by spaces). - If MeCab is not available, return (None, None, None, None). - """ if not MECAB_AVAILABLE: return None, None, None, None @@ -389,10 +381,9 @@ def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio surface = node.surface features = node.feature.split(",") if len(features) >= 1: - pos = features[0] # e.g. 助詞, 名詞, 動詞... + pos = features[0] tokens_original.append(surface) pos_tokens.append(pos) - # Override if particle "は" (助詞) if pos == "助詞" and surface == "は": tokens_for_hira.append("わ") else: @@ -415,21 +406,12 @@ def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio # ========== 3) spaCy Morphological Tokenization ========== _spacy_nlp = None def load_spacy_japanese(): - """Lazy-load the spaCy model. Requires 'ja_core_news_sm' or similar to be installed.""" global _spacy_nlp if _spacy_nlp is None: _spacy_nlp = spacy.load("ja_core_news_sm") return _spacy_nlp def spacy_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """ - Use spaCy morphological analysis. Return four strings: - 1) spaced_original: original token texts joined by spaces. - 2) spaced_hira_subbed: token texts (with "は" overridden to "わ" when pos_ is ADP) converted to Hiragana. - 3) spaced_hira_original: Hiragana conversion of the original spaced token texts. - 4) pos_tags: part-of-speech tags (using token.pos_) joined by spaces. - If spaCy is not available, return (None, None, None, None). - """ if not SPACY_AVAILABLE: return None, None, None, None @@ -459,7 +441,6 @@ def spacy_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio # ========== 4) Unified "get spaced reading" function ========== def get_spaced_reading(text: str, method: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """Return (spaced_original, spaced_hira_subbed, spaced_hira_original, pos_tags) using the chosen method.""" if method == "mecab": return mecab_spaced_reading(text) elif method == "spacy": @@ -480,10 +461,6 @@ def write_text_output( include_sentence: bool = True, sep: str = "\t" ) -> None: - """ - Write a plain-text file, one line per entry. - Default format: "\\t" - """ with open(output_file, "w", encoding="utf-8") as fout: for obj in out_array: sent = obj.get("sentence", "") @@ -494,6 +471,10 @@ def write_text_output( fout.write(f"{val}\n") +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + # ========== 6) Main Processing Logic ========== def process_japanese_text( input_file: str, @@ -505,19 +486,16 @@ def process_japanese_text( text_field: str = "spaced_ipa", text_include_sentence: bool = True, text_sep: str = "\t", + stats_json: Optional[str] = None, ): """ - Processes Japanese text to IPA. + Same behavior as before, plus byte coverage stats: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that are considered "transcribed" + (anything containing Japanese script: Hiragana/Katakana/Kanji) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (Latin, digits, punctuation) - INPUT MODES (same defaults as original): - - If json_inplace_update=True: treat input as JSON array with "sentence" fields. - - Else: treat input as plain text (one sentence per line). 
- - OUTPUT MODES (new): - - Default (unchanged): write JSON array to output_file - - If output_text=True: write plain text to output_file using selected field(s) + Counts are based on ORIGINAL tokens (wrapper overhead doesn't exist in this script). """ - # Decide morphological method: if use_mecab and use_spacy: print("Error: Please choose either MeCab or spaCy, not both.") sys.exit(1) @@ -528,10 +506,33 @@ def process_japanese_text( else: morph_method = None + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} out_array: List[Dict[str, Any]] = [] + def is_japanese_char(ch: str) -> bool: + o = ord(ch) + # Hiragana, Katakana, CJK Unified Ideographs (basic), plus common punctuation blocks are excluded intentionally. + return (0x3040 <= o <= 0x309F) or (0x30A0 <= o <= 0x30FF) or (0x4E00 <= o <= 0x9FFF) + + def is_japanese_token(tok: str) -> bool: + return any(is_japanese_char(ch) for ch in tok) + + def count_sentence_bytes(sentence: str) -> None: + # Tokenize similarly to your KR/ZH scripts for consistent accounting + toks = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE) + for tok in toks: + b = utf8_len(tok) + if re.match(r"\w+", tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif is_japanese_token(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + if json_inplace_update: - # JSON input: process as JSON array. try: with open(input_file, "r", encoding="utf-8") as fin: data = json.load(fin) @@ -541,6 +542,8 @@ def process_japanese_text( continue original_text = entry["sentence"] + count_sentence_bytes(original_text) + hira_unspaced = to_hiragana(original_text) ipa_unspaced = hiragana_to_ipa(hira_unspaced) @@ -558,11 +561,7 @@ def process_japanese_text( out_obj["spaced_original"] = spaced_original if spaced_original is not None else "" out_obj["spaced_hira_subbed"] = spaced_hira_subbed if spaced_hira_subbed is not None else "" out_obj["pos_tags"] = pos_tags if pos_tags is not None else "" - - ipa_spaced = "" - if out_obj["spaced_hira_subbed"]: - ipa_spaced = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) - out_obj["spaced_ipa"] = ipa_spaced + out_obj["spaced_ipa"] = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) if out_obj["spaced_hira_subbed"] else "" out_array.append(out_obj) @@ -577,7 +576,6 @@ def process_japanese_text( return else: - # Plain text input: each non-blank line is treated as a sentence. 
try: with open(input_file, "r", encoding="utf-8") as fin: lines = fin.readlines() @@ -588,6 +586,8 @@ def process_japanese_text( continue original_text = line + count_sentence_bytes(original_text) + hira_unspaced = to_hiragana(original_text) ipa_unspaced = hiragana_to_ipa(hira_unspaced) @@ -605,11 +605,7 @@ def process_japanese_text( out_obj["spaced_original"] = spaced_original if spaced_original is not None else "" out_obj["spaced_hira_subbed"] = spaced_hira_subbed if spaced_hira_subbed is not None else "" out_obj["pos_tags"] = pos_tags if pos_tags is not None else "" - - ipa_spaced = "" - if out_obj["spaced_hira_subbed"]: - ipa_spaced = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) - out_obj["spaced_ipa"] = ipa_spaced + out_obj["spaced_ipa"] = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) if out_obj["spaced_hira_subbed"] else "" out_array.append(out_obj) @@ -620,7 +616,7 @@ def process_japanese_text( print(f"An error occurred: {e}") return - # OUTPUT (default unchanged: JSON) + # OUTPUT (unchanged) if output_text: write_text_output( output_file=output_file, @@ -632,14 +628,44 @@ def process_japanese_text( else: write_json_array(output_file=output_file, out_array=out_array) + # Print + optional write stats + transcribed = int(stats["transcribed_bytes"]) + not_tx = int(stats["not_transcribed_bytes"]) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json: + with open(stats_json, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json}") + # ========== 7) Command-Line Entry Point ========== if __name__ == "__main__": + import re # local import to avoid changing your top imports too much + parser = argparse.ArgumentParser( description=( "Convert JP text to IPA with optional morphological spacing and POS tagging.\n" "DEFAULT behavior matches original: input may be JSON (-j) or plain text, output is JSON array.\n" - "NEW: you can output plain text with --text_output." + "You can output plain text with --text_output.\n" + "NEW: prints byte coverage stats and can write them with --stats_json." ) ) parser.add_argument( @@ -658,42 +684,27 @@ def process_japanese_text( parser.add_argument( "-j", "--json_inplace_update", action="store_true", + help="Treat input file as JSON array and update each entry." ) group = parser.add_mutually_exclusive_group() - group.add_argument( - "--use_mecab", - action="store_true", - help="Use MeCab for morphological tokenization (and forcing 'は' => 'わ')." - ) - group.add_argument( - "--use_spacy", - action="store_true", - help="Use spaCy for morphological tokenization (and forcing 'は' => 'わ')." - ) - - # NEW OUTPUT MODE - parser.add_argument( - "--text_output", - action="store_true", - help="Write a plain-text output file (one line per sentence) instead of JSON." 
- ) - parser.add_argument( - "--text_field", - default="spaced_ipa", - help="Which field to emit in --text_output mode (default: spaced_ipa). " - "Common choices: unspaced_ipa, spaced_ipa, spaced_hira_subbed, pos_tags." - ) - parser.add_argument( - "--text_no_sentence", - action="store_true", - help="In --text_output mode, emit only the selected field (omit the original sentence)." - ) - parser.add_argument( - "--text_sep", - default="\t", - help="Separator used between sentence and field in --text_output mode (default: tab)." - ) + group.add_argument("--use_mecab", action="store_true", + help="Use MeCab for morphological tokenization (and forcing 'は' => 'わ').") + group.add_argument("--use_spacy", action="store_true", + help="Use spaCy for morphological tokenization (and forcing 'は' => 'わ').") + + parser.add_argument("--text_output", action="store_true", + help="Write a plain-text output file (one line per sentence) instead of JSON.") + parser.add_argument("--text_field", default="spaced_ipa", + help="Which field to emit in --text_output mode (default: spaced_ipa). " + "Common: unspaced_ipa, spaced_ipa, spaced_hira_subbed, pos_tags.") + parser.add_argument("--text_no_sentence", action="store_true", + help="In --text_output mode, emit only the selected field (omit the original sentence).") + parser.add_argument("--text_sep", default="\t", + help="Separator used between sentence and field in --text_output mode (default: tab).") + + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") args = parser.parse_args() @@ -707,5 +718,6 @@ def process_japanese_text( text_field=args.text_field, text_include_sentence=(not args.text_no_sentence), text_sep=args.text_sep, + stats_json=args.stats_json, ) diff --git a/data/template/utils/ko_en_to_ipa.py b/data/template/utils/ko_en_to_ipa.py index 28e90963a7..4392b63a06 100644 --- a/data/template/utils/ko_en_to_ipa.py +++ b/data/template/utils/ko_en_to_ipa.py @@ -5,6 +5,17 @@ import re import json +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_korean_token(token: str) -> bool: + return any('가' <= ch <= '힣' for ch in token) + def transcribe_korean(sentence, wrapper=False): """Transcribe a Korean sentence into its phonemes using KoNLPy (Okt) + espeak-ng.""" @@ -25,7 +36,7 @@ def transcribe_korean(sentence, wrapper=False): # Check for failed transcription markers if "(en)" in transcription or "(ko)" in transcription: if wrapper: - return "[[[[[" + sentence + "]]]]]" + return f"{WRAP_PREFIX}{sentence}{WRAP_SUFFIX}" return sentence return transcription @@ -37,40 +48,69 @@ def transcribe_korean(sentence, wrapper=False): def handle_mixed_language(word, wrapper=False): """Handle a word with potential Korean, other language, or number content.""" - if word.isdigit(): # Detect numbers (pass through unchanged) + if word.isdigit(): # numbers pass through unchanged return word - elif any('가' <= char <= '힣' for char in word): # Detect Korean + elif is_korean_token(word): return transcribe_korean(word, wrapper=wrapper) - else: # Non-Korean word + else: # Non-Korean if wrapper: - return "[[[[[" + word + "]]]]]" + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" return word -def transcribe_plain_text(text, wrapper=False): - """Transcribe a plain text string into IPA, leaving non-Korean as-is (or wrapped).""" - result = [] +def transcribe_plain_text( + text, + wrapper=False, + stats=None, +): + """ + 
Transcribe a plain text string into IPA, leaving non-Korean as-is (or wrapped). + + If stats dict is provided, it will be updated with: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (Korean tokens only) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (includes Latin, digits, punctuation) + Counts are based on ORIGINAL tokens, so wrapper overhead is excluded automatically. + """ + if stats is None: + stats = {} + + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + out = [] words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE) - for word in words: - if re.match(r'\w+', word): - result.append(handle_mixed_language(word, wrapper=wrapper)) + for tok in words: + tok_bytes = utf8_len(tok) + + if re.match(r'\w+', tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += tok_bytes + elif is_korean_token(tok): + stats["transcribed_bytes"] += tok_bytes + else: + stats["not_transcribed_bytes"] += tok_bytes + + out.append(handle_mixed_language(tok, wrapper=wrapper)) else: - result.append(word) - return " ".join(result) + # punctuation/symbols + stats["not_transcribed_bytes"] += tok_bytes + out.append(tok) + + return " ".join(out) -def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False): +def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False, stats=None): """ Transcribe multilingual sentences and update JSON data directly. - Args: - sentences: JSON string or a loaded JSON object. - input_json_key: Key to extract sentences from in a JSON. - output_json_key: Key to store IPA transcription in the JSON (default: 'ipa'). - - Returns: - The modified JSON string with IPA transcriptions added. + Returns the modified JSON string with IPA transcriptions added. + If stats dict is provided, it will be updated with byte coverage counts. 
""" + if stats is None: + stats = {} + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + try: data = json.loads(sentences) if isinstance(sentences, str) else sentences if not isinstance(data, list): @@ -79,8 +119,8 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa for item in data: if input_json_key in item: sentence = item[input_json_key] - transcription_result = transcribe_plain_text(sentence, wrapper=wrapper) - item[output_json_key] = transcription_result # Update directly + transcription_result = transcribe_plain_text(sentence, wrapper=wrapper, stats=stats) + item[output_json_key] = transcription_result print(transcription_result) else: print(f"Warning: Key '{input_json_key}' not found in item: {item}") @@ -92,9 +132,39 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa return json.dumps(data, ensure_ascii=False, indent=4) +def finalize_and_print_stats(stats, stats_json_path=None): + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + def main(): parser = argparse.ArgumentParser( - description='Transcribe multilingual text or JSON into IPA phonemes (Korean via espeak-ng).' + description='Transcribe multilingual text or JSON into IPA phonemes (Korean via espeak-ng), with byte coverage stats.' ) parser.add_argument( @@ -132,11 +202,20 @@ def main(): "--wrapper", default=False, action=argparse.BooleanOptionalAction, - help="Wrap unparseable text with [[[[[square brackets]]]]], for later recovery." + help="Wrap unparseable/non-target tokens with [[[[[...]]]]]. Use --no-wrapper to leave them unchanged." + ) + + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing)." 
) args = parser.parse_args() + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + try: with open(args.input_file, 'r', encoding='utf-8') as f: input_content = f.read() @@ -145,7 +224,8 @@ def main(): if args.text_input: transcription = transcribe_plain_text( input_content, - wrapper=args.wrapper + wrapper=args.wrapper, + stats=stats ) if args.text_output: @@ -164,11 +244,11 @@ def main(): input_content, args.input_json_key, args.output_json_key, - wrapper=args.wrapper + wrapper=args.wrapper, + stats=stats ) if updated_json_data: - # Default behavior: overwrite original JSON if args.text_output: with open(args.text_output, 'w', encoding='utf-8') as f: f.write(updated_json_data) @@ -178,6 +258,8 @@ def main(): f.write(updated_json_data) print(f"Successfully updated JSON data in '{args.input_file}'") + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except ValueError as e: diff --git a/data/template/utils/zh_to_ipa.py b/data/template/utils/zh_to_ipa.py index 387bca3303..987e112487 100644 --- a/data/template/utils/zh_to_ipa.py +++ b/data/template/utils/zh_to_ipa.py @@ -9,6 +9,20 @@ import json +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_chinese_token(token: str) -> bool: + # Keeps your original behavior: "Chinese" means "contains any simplified Hanzi" + # (This will miss pure-traditional-only text; expand if you want.) + return any(hanzi.is_simplified(ch) for ch in token) + + def transcribe_chinese(sentence: str) -> str: """Transcribe a Chinese sentence into its phonemes using dragonmapper.""" try: @@ -20,12 +34,12 @@ def transcribe_chinese(sentence: str) -> str: def handle_mixed_language(word: str, wrapper: bool = True) -> str: """Handle a word with potential Chinese, other language, or number content.""" - if word.isdigit(): # Detect numbers (pass through unchanged) + if word.isdigit(): # numbers: passthrough return word - elif any(hanzi.is_simplified(char) for char in word): # Detect Simplified Chinese chars + elif is_chinese_token(word): # Chinese: IPA return transcribe_chinese(word) - else: # Non-Chinese word - return f"[[[[[{word}]]]]]" if wrapper else word + else: # Non-Chinese: wrap or passthrough + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" if wrapper else word def transcribe_multilingual( @@ -39,84 +53,106 @@ def transcribe_multilingual( """ Transcribe multilingual sentences (Chinese + non-Chinese passthrough/wrap) and save to a file. - Args: - data: The input data (list of dicts if JSON, list of strings if plain text). - output_file: Path to the output file. - json_inplace_update: If True, process JSON input and add IPA to the same JSON objects. - json_input_field: The field in the JSON data to transcribe (default: "sentence"). - json_output_field: The field to write the IPA transcription to (default: "sentence_ipa"). - wrapper: If True, wrap non-Chinese tokens like [[[[[token]]]]]. If False, leave them unchanged. + Also computes byte counts: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (Chinese tokens) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were not transcribed + (includes Latin words, digits, punctuation, etc.) + These counts are based on ORIGINAL text tokens, so wrapper overhead is automatically excluded. + + Returns: + stats dict with transcribed_bytes, not_transcribed_bytes, total_bytes, and percents. 
""" + transcribed_bytes = 0 + not_transcribed_bytes = 0 + + def process_sentence(sentence: str) -> str: + nonlocal transcribed_bytes, not_transcribed_bytes + + # Split sentence using jieba (your original behavior) + seg_list = jieba.cut(sentence, cut_all=False) + seg_sentence = "".join(seg_list) + + # Split but keep punctuation + words = re.findall(r"\w+|[^\w\s]", seg_sentence, re.UNICODE) + + out_parts = [] + for tok in words: + tok_bytes = utf8_len(tok) + + if re.match(r"\w+", tok): + # word-ish token + if tok.isdigit(): + not_transcribed_bytes += tok_bytes + elif is_chinese_token(tok): + transcribed_bytes += tok_bytes + else: + not_transcribed_bytes += tok_bytes + + out_parts.append(handle_mixed_language(tok, wrapper=wrapper)) + else: + # punctuation / symbols + not_transcribed_bytes += tok_bytes + out_parts.append(tok) + + return " ".join(out_parts) + if json_inplace_update: # In-place update for JSON data for item in data: if json_input_field in item: sentence = item[json_input_field] - result = [] - - # Split sentence using jieba - seg_list = jieba.cut(sentence, cut_all=False) - seg_sentence = "".join(seg_list) + item[json_output_field] = process_sentence(sentence) - # Split sentence but keep punctuation - words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words - result.append(handle_mixed_language(word, wrapper=wrapper)) - else: - result.append(word) # Preserve punctuation - - transcription_result = " ".join(result) - item[json_output_field] = transcription_result - - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) print(f"In-place JSON transcription saved to {output_file}") else: - # Standard transcription (either JSON or plain text to plain text output) - with open(output_file, 'w', encoding='utf-8') as f: + # Standard transcription to plain text output (one line per item) + with open(output_file, "w", encoding="utf-8") as f: for item in data: - result = [] if isinstance(item, dict): sentence = item.get(json_input_field, "") else: sentence = item - # Split sentence using jieba - seg_list = jieba.cut(sentence, cut_all=False) - seg_sentence = "".join(seg_list) + transcription_result = process_sentence(sentence) + f.write(transcription_result + "\n") + print(transcription_result) - # Split sentence but keep punctuation - words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words - result.append(handle_mixed_language(word, wrapper=wrapper)) - else: - result.append(word) # Preserve punctuation + total_bytes = transcribed_bytes + not_transcribed_bytes + pct_transcribed = (transcribed_bytes / total_bytes * 100.0) if total_bytes else 0.0 + pct_not = (not_transcribed_bytes / total_bytes * 100.0) if total_bytes else 0.0 - transcription_result = " ".join(result) - f.write(transcription_result + "\n") - print(transcription_result) # Print to console + stats = { + "transcribed_bytes": transcribed_bytes, + "not_transcribed_bytes": not_transcribed_bytes, + "total_bytes": total_bytes, + "pct_transcribed": pct_transcribed, + "pct_not_transcribed": pct_not, + } + return stats def main(): - parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes (Chinese via dragonmapper).') + parser = argparse.ArgumentParser( + description="Transcribe multilingual sentences into IPA phonemes (Chinese via dragonmapper), 
with byte coverage stats." + ) parser.add_argument( - 'input_file', + "input_file", type=str, - help='Path to the input file containing sentences in json or text format.' + help="Path to the input file containing sentences in json or text format." ) parser.add_argument( - 'output_file', + "output_file", type=str, - help='Path to the output file for IPA transcription.' + help="Path to the output file for IPA transcription." ) parser.add_argument( - '--input_type', + "--input_type", type=str, - choices=['json', 'text'], - default='json', + choices=["json", "text"], + default="json", help='Type of input file: "json" or "text" (default: json)' ) parser.add_argument( @@ -140,18 +176,23 @@ def main(): action=argparse.BooleanOptionalAction, help="Wrap non-Chinese tokens as [[[[[...]]]]] (default: true). Use --no-wrapper to leave them unchanged." ) + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write stats as JSON to this path (in addition to printing)." + ) args = parser.parse_args() try: - with open(args.input_file, 'r', encoding='utf-8') as f: - if args.input_type == 'json': + with open(args.input_file, "r", encoding="utf-8") as f: + if args.input_type == "json": data = json.load(f) else: - # Keep lines as strings; strip newline later if you want data = [line.rstrip("\n") for line in f.readlines()] - transcribe_multilingual( + stats = transcribe_multilingual( data=data, output_file=args.output_file, json_inplace_update=args.json_inplace_update, @@ -160,6 +201,19 @@ def main(): wrapper=args.wrapper, ) + # Print summary stats (wrapper overhead is automatically excluded because we count ORIGINAL token bytes) + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {stats['total_bytes']}") + print(f"% transcribed : {stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {stats['pct_not_transcribed']:.2f}%") + + if args.stats_json: + with open(args.stats_json, "w", encoding="utf-8") as sf: + json.dump(stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {args.stats_json}") + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except json.JSONDecodeError: @@ -168,6 +222,6 @@ def main(): print(f"An unexpected error occurred: {e}") -if __name__ == '__main__': +if __name__ == "__main__": main() From 21dab6240444b0d2ccb185e25b49473ebd8a71c4 Mon Sep 17 00:00:00 2001 From: klei22 Date: Tue, 23 Dec 2025 18:51:36 -0800 Subject: [PATCH 02/10] Adding stats for en, zh, and ja --- data/flores200-res/eng_stats.json | 7 +++++++ data/flores200-res/ja_stats.json | 7 +++++++ data/flores200-res/zh_stats.json | 7 +++++++ 3 files changed, 21 insertions(+) create mode 100644 data/flores200-res/eng_stats.json create mode 100644 data/flores200-res/ja_stats.json create mode 100644 data/flores200-res/zh_stats.json diff --git a/data/flores200-res/eng_stats.json b/data/flores200-res/eng_stats.json new file mode 100644 index 0000000000..abcded9fae --- /dev/null +++ b/data/flores200-res/eng_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208634, + "not_transcribed_bytes": 7905, + "total_bytes": 216539, + "pct_transcribed": 96.34938740827288, + "pct_not_transcribed": 3.6506125917271253 +} \ No newline at end of file diff --git a/data/flores200-res/ja_stats.json b/data/flores200-res/ja_stats.json new file mode 100644 index 0000000000..30b83d8af6 --- 
/dev/null +++ b/data/flores200-res/ja_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 305426, + "not_transcribed_bytes": 21596, + "total_bytes": 327022, + "pct_transcribed": 93.3961629492817, + "pct_not_transcribed": 6.6038370507183 +} \ No newline at end of file diff --git a/data/flores200-res/zh_stats.json b/data/flores200-res/zh_stats.json new file mode 100644 index 0000000000..eda44595ef --- /dev/null +++ b/data/flores200-res/zh_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208747, + "not_transcribed_bytes": 27209, + "total_bytes": 235956, + "pct_transcribed": 88.4686127922155, + "pct_not_transcribed": 11.531387207784501 +} \ No newline at end of file From 8a5ec691e0688c66d1a9acd191dd5454cdca5db6 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sun, 28 Dec 2025 13:21:43 -0800 Subject: [PATCH 03/10] Add README.md stats and script updates --- data/flores200-res/README.md | 20 ++ data/flores200-res/get_dataset.sh | 215 +++++++++++++++++++- data/flores200-res/ko_stats.json | 7 + data/flores200-res/phoneticize.sh | 23 ++- data/template/utils/espeak2ipa.py | 326 +++++++++++++++++------------- 5 files changed, 446 insertions(+), 145 deletions(-) create mode 100644 data/flores200-res/README.md create mode 100644 data/flores200-res/ko_stats.json diff --git a/data/flores200-res/README.md b/data/flores200-res/README.md new file mode 100644 index 0000000000..aab97633c7 --- /dev/null +++ b/data/flores200-res/README.md @@ -0,0 +1,20 @@ +# Scripts compatible with Flores-200 Restructured + +This is a folder with scripts compatible with the Flores-200 project, originally +from: +https://github.com/facebookresearch/flores/blob/main/README.md + +Though scripts target the restructured format proposed by muhammadravi251001: +https://huggingface.co/datasets/muhammadravi251001/restructured-flores200 + +# License of dataset + +The Flores 200 dataset is licensed under CC-By-SA 4.0. 
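+## Usage
+
+A minimal sketch of the intended flow (both scripts live in this folder; see
+`phoneticize.sh` for the per-language phoneticizer invocations it wraps):
+
+```bash
+bash get_dataset.sh     # download the restructured Flores-200 text files
+bash phoneticize.sh     # convert each text_<lang>.txt into ipa_text_<lang>.txt
+```
+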
+ +## Language Codes + +Language Codes here for Flore-200: +https://github.com/facebookresearch/flores/blob/main/flores200/README.md + +Language Codes here for espeak (basis of many of the phoneticizers): +https://espeak.sourceforge.net/languages.html diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index deda56df65..d95b965614 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -17,9 +17,222 @@ lang_array=( "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" - "text_shn_Mymr" + "text_hin_Deva" + "text_vie_Latn" + "text_ind_Latn" + "text_swh_Latn" + "text_ell_Grek" + "text_fra_Latn" ) +# lang_array=( +# "text_ace_Arab" +# "text_ace_Latn" +# "text_acm_Arab" +# "text_acq_Arab" +# "text_aeb_Arab" +# "text_afr_Latn" +# "text_ajp_Arab" +# "text_aka_Latn" +# "text_als_Latn" +# "text_amh_Ethi" +# "text_apc_Arab" +# "text_arb_Arab" +# "text_arb_Latn" +# "text_ars_Arab" +# "text_ary_Arab" +# "text_arz_Arab" +# "text_asm_Beng" +# "text_ast_Latn" +# "text_awa_Deva" +# "text_ayr_Latn" +# "text_azb_Arab" +# "text_azj_Latn" +# "text_bak_Cyrl" +# "text_bam_Latn" +# "text_ban_Latn" +# "text_bel_Cyrl" +# "text_bem_Latn" +# "text_ben_Beng" +# "text_bho_Deva" +# "text_bjn_Arab" +# "text_bjn_Latn" +# "text_bod_Tibt" +# "text_bos_Latn" +# "text_bug_Latn" +# "text_bul_Cyrl" +# "text_cat_Latn" +# "text_ceb_Latn" +# "text_ces_Latn" +# "text_cjk_Latn" +# "text_ckb_Arab" +# "text_crh_Latn" +# "text_cym_Latn" +# "text_dan_Latn" +# "text_deu_Latn" +# "text_dik_Latn" +# "text_dyu_Latn" +# "text_dzo_Tibt" +# "text_ell_Grek" +# "text_eng_Latn" +# "text_epo_Latn" +# "text_est_Latn" +# "text_eus_Latn" +# "text_ewe_Latn" +# "text_fao_Latn" +# "text_fij_Latn" +# "text_fin_Latn" +# "text_fon_Latn" +# "text_fra_Latn" +# "text_fur_Latn" +# "text_fuv_Latn" +# "text_gaz_Latn" +# "text_gla_Latn" +# "text_gle_Latn" +# "text_glg_Latn" +# "text_grn_Latn" +# "text_guj_Gujr" +# "text_hat_Latn" +# "text_hau_Latn" +# "text_heb_Hebr" +# "text_hin_Deva" +# "text_hne_Deva" +# "text_hrv_Latn" +# "text_hun_Latn" +# "text_hye_Armn" +# "text_ibo_Latn" +# "text_ilo_Latn" +# "text_ind_Latn" +# "text_isl_Latn" +# "text_ita_Latn" +# "text_jav_Latn" +# "text_jpn_Jpan" +# "text_kab_Latn" +# +# "text_kac_Latn" +# "text_kam_Latn" +# "text_kan_Knda" +# "text_kas_Arab" +# "text_kas_Deva" +# "text_kat_Geor" +# "text_kaz_Cyrl" +# "text_kbp_Latn" +# "text_kea_Latn" +# "text_khk_Cyrl" +# "text_khm_Khmr" +# "text_kik_Latn" +# "text_kin_Latn" +# "text_kir_Cyrl" +# "text_kmb_Latn" +# "text_kmr_Latn" +# "text_knc_Arab" +# "text_knc_Latn" +# "text_kon_Latn" +# "text_kor_Hang" +# "text_lao_Laoo" +# "text_lij_Latn" +# "text_lim_Latn" +# "text_lin_Latn" +# "text_lit_Latn" +# "text_lmo_Latn" +# "text_ltg_Latn" +# "text_ltz_Latn" +# "text_lua_Latn" +# "text_lug_Latn" +# "text_luo_Latn" +# "text_lus_Latn" +# "text_lvs_Latn" +# "text_mag_Deva" +# "text_mai_Deva" +# "text_mal_Mlym" +# "text_mar_Deva" +# "text_min_Arab" +# "text_min_Latn" +# "text_mkd_Cyrl" +# "text_mlt_Latn" +# "text_mni_Beng" +# "text_mos_Latn" +# "text_mri_Latn" +# "text_mya_Mymr" +# "text_nld_Latn" +# "text_nno_Latn" +# "text_nob_Latn" +# "text_npi_Deva" +# "text_nso_Latn" +# "text_nus_Latn" +# "text_nya_Latn" +# "text_oci_Latn" +# "text_ory_Orya" +# "text_pag_Latn" +# "text_pan_Guru" +# "text_pap_Latn" +# "text_pbt_Arab" +# "text_pes_Arab" +# "text_plt_Latn" +# "text_pol_Latn" +# "text_por_Latn" +# "text_prs_Arab" +# "text_quy_Latn" +# "text_ron_Latn" +# "text_run_Latn" +# "text_rus_Cyrl" +# "text_sag_Latn" +# "text_san_Deva" +# 
"text_sat_Olck" +# "text_scn_Latn" +# "text_shn_Mymr" +# "text_sin_Sinh" +# "text_slk_Latn" +# "text_slv_Latn" +# "text_smo_Latn" +# "text_sna_Latn" +# "text_snd_Arab" +# "text_som_Latn" +# "text_sot_Latn" +# "text_spa_Latn" +# "text_srd_Latn" +# "text_srp_Cyrl" +# "text_ssw_Latn" +# "text_sun_Latn" +# "text_swe_Latn" +# "text_swh_Latn" +# "text_szl_Latn" +# "text_tam_Taml" +# "text_taq_Latn" +# "text_taq_Tfng" +# "text_tat_Cyrl" +# "text_tel_Telu" +# "text_tgk_Cyrl" +# "text_tgl_Latn" +# "text_tha_Thai" +# "text_tir_Ethi" +# "text_tpi_Latn" +# "text_tsn_Latn" +# "text_tso_Latn" +# "text_tuk_Latn" +# "text_tum_Latn" +# "text_tur_Latn" +# "text_twi_Latn" +# "text_tzm_Tfng" +# "text_uig_Arab" +# "text_ukr_Cyrl" +# "text_umb_Latn" +# "text_urd_Arab" +# "text_uzn_Latn" +# "text_vec_Latn" +# "text_vie_Latn" +# "text_war_Latn" +# "text_wol_Latn" +# "text_xho_Latn" +# "text_ydd_Hebr" +# "text_yor_Latn" +# "text_yue_Hant" +# "text_zho_Hans" +# "text_zho_Hant" +# "text_zsm_Latn" +# "text_zul_Latn" +# ) + # Add url with dataset here: url="https://huggingface.co/datasets/muhammadravi251001/restructured-flores200/tree/main/data" diff --git a/data/flores200-res/ko_stats.json b/data/flores200-res/ko_stats.json new file mode 100644 index 0000000000..5330f63aca --- /dev/null +++ b/data/flores200-res/ko_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 271690, + "not_transcribed_bytes": 8833, + "total_bytes": 280523, + "pct_transcribed": 96.85123857936783, + "pct_not_transcribed": 3.148761420632176 +} \ No newline at end of file diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index b52f0c2789..98fee2e617 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -1,9 +1,22 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json -python3 utils/espeak2ipa.py text_shn_Mymr.txt --mode text --output_file ipa_text_shan.txt --no-wrapper --stats_json shan_stats.json --lang shan +# python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +# python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +# python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +# python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json +lang_array=( + "text_vie_Latn:vi" + "text_ind_Latn:id" + "text_swh_Latn:sw" + "text_ell_Grek:el" + "text_fra_Latn:fr" +) + +for lang in "${lang_array[@]}"; do + text_file="${lang%%:*}" + two_letter_code="${lang##*:}" + echo "${text_file}; ${two_letter_code}" + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence +done diff --git a/data/template/utils/espeak2ipa.py b/data/template/utils/espeak2ipa.py index 9c2864afc0..0e017747fa 100644 --- 
a/data/template/utils/espeak2ipa.py +++ b/data/template/utils/espeak2ipa.py @@ -2,27 +2,38 @@ # espeak2ipa.py # # Generic IPA transcription using espeak-ng for ANY supported voice. -# Defaults to "shan" (you can override with --lang). +# Defaults to "shn" (override with --lang). # -# Features (modeled after your en2ipa.py): -# - JSON list mode (--mode json): in-place update of a JSON list file -# - Text mode (--mode text): line-by-line transcription to output file (or overwrite input) +# Features: +# - JSON list mode (--mode json): +# - default: overwrite input JSON file adding output_json_key per item +# - with --text_output: emit a text file (sentenceipa OR ipa-only via --text_no_sentence) +# - Text mode (--mode text): input is one sentence per line +# - default: emits IPA-only (backward-compatible with your existing espeak2ipa.py) +# - with --text_output: emits sentenceipa (JP-like), unless --text_no_sentence # - Optional wrapping for untranscribed/unparseable tokens: [[[[[...]]]]] # - Multithreading with ordered output # - Rich progress bar # - Byte coverage stats (based on ORIGINAL tokens; wrapper overhead excluded) # # Notes: -# - "transcribed_bytes" counts bytes of ORIGINAL tokens we ATTEMPT to send to espeak +# - "transcribed_bytes" counts UTF-8 bytes of ORIGINAL tokens we ATTEMPT to send to espeak # (tokens that contain at least one Unicode letter). Digits/punct count as not_transcribed. -# - espeak-ng voices vary; if a voice is unavailable, you'll get an error/empty output. +# - If espeak-ng outputs empty text for a token, we treat it as "unparseable" and optionally wrap it. import subprocess import argparse import re import json -from typing import List, Tuple, Optional, Dict, Any -from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn +from typing import List, Optional, Dict, Any, Tuple +from rich.progress import ( + Progress, + BarColumn, + TextColumn, + TimeRemainingColumn, + TimeElapsedColumn, + MofNCompleteColumn, +) from concurrent.futures import ThreadPoolExecutor, as_completed import os import threading @@ -55,7 +66,7 @@ def transcribe_espeak(token: str, lang: str, wrapper: bool = False) -> str: result = subprocess.run( ["espeak-ng", "-q", "-v", lang, "--ipa", token], capture_output=True, - text=True + text=True, ) out = (result.stdout or "").strip().replace("ㆍ", " ") if not out: @@ -97,12 +108,18 @@ def tokens_to_ipa_string(tokens: List[str], lang: str, wrapper: bool) -> str: return " ".join(out) -def _worker_sentence(sentence: str, lang: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: +def _worker_sentence( + sentence: str, + lang: str, + wrapper: bool, + stats: Optional[Dict[str, int]] = None, +) -> str: """ Tokenize & transcribe one sentence/line. + If stats is provided, updates byte counts based on ORIGINAL tokens: - transcribed_bytes: tokens containing at least one letter - - not_transcribed_bytes: digits + punctuation/symbols + other \w tokens with no letters + - not_transcribed_bytes: digits + punctuation/symbols + other \\w tokens with no letters """ tokens = _WORD_RE.findall(sentence) @@ -134,97 +151,19 @@ def _progress() -> Progress: ) -def transcribe_json_list( - json_text_or_obj, - input_json_key: str, - output_json_key: str, - lang: str, - wrapper: bool, - multithread: bool, - workers: int, - stats: Optional[Dict[str, int]] = None, -) -> Optional[str]: - """ - JSON list mode: reads a JSON list of objects, writes output_json_key for each object. 
- Returns JSON string (pretty printed). - """ - try: - data = json.loads(json_text_or_obj) if isinstance(json_text_or_obj, str) else json_text_or_obj - if not isinstance(data, list): - raise ValueError("JSON data should be a list of objects.") - except Exception as e: - print(f"Error: {e}") - return None - - n = len(data) - if stats is None: - stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} - else: - stats.setdefault("transcribed_bytes", 0) - stats.setdefault("not_transcribed_bytes", 0) - - if n == 0: - return json.dumps(data, ensure_ascii=False, indent=4) - - if not multithread or workers <= 1: - with _progress() as progress: - task = progress.add_task("Processing JSON items", total=n) - for item in data: - if input_json_key in item: - sentence = item[input_json_key] - item[output_json_key] = _worker_sentence(sentence, lang=lang, wrapper=wrapper, stats=stats) - progress.update(task, advance=1) - else: - # ordered results - results: List[Tuple[int, str]] = [None] * n # type: ignore - per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore - - jobs = [(i, data[i].get(input_json_key, "")) for i in range(n)] - - with _progress() as progress: - task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) - with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = {} - for idx, sentence in jobs: - local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} - per_item_stats[idx] = local_stats - fut = ex.submit(_worker_sentence, sentence, lang, wrapper, local_stats) - future_to_idx[fut] = idx - - for fut in as_completed(future_to_idx): - idx = future_to_idx[fut] - try: - res = fut.result() - except Exception as e: - res = f"Error: {e}" - results[idx] = (idx, res) - progress.update(task, advance=1) - - # merge stats - for st in per_item_stats: - stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) - stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) - - # write back in original order - for idx, item in enumerate(data): - if input_json_key in item: - item[output_json_key] = results[idx][1] - - return json.dumps(data, ensure_ascii=False, indent=4) - - -def transcribe_text_lines( - lines: List[str], +def transcribe_sentences( + sentences: List[str], lang: str, wrapper: bool, multithread: bool, workers: int, stats: Optional[Dict[str, int]] = None, + progress_label: str = "Processing", ) -> List[str]: """ - Text mode: input is one sentence per line. Output is one IPA line per input line. + Transcribe a list of sentences into IPA, returning results in the same order. 
""" - n = len(lines) + n = len(sentences) if stats is None: stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} else: @@ -235,41 +174,52 @@ def transcribe_text_lines( return [] if not multithread or workers <= 1: - out_lines: List[str] = [] + out: List[str] = [] with _progress() as progress: - task = progress.add_task("Processing text lines", total=n) - for line in lines: - raw = line.rstrip("\n") - out_lines.append(_worker_sentence(raw, lang=lang, wrapper=wrapper, stats=stats)) + task = progress.add_task(progress_label, total=n) + for s in sentences: + out.append(_worker_sentence(s, lang=lang, wrapper=wrapper, stats=stats)) progress.update(task, advance=1) - return out_lines + return out - out_lines: List[str] = [None] * n # type: ignore + # Multithreaded path: per-item stats then merge at end + out: List[str] = ["" for _ in range(n)] per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore with _progress() as progress: - task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) + task = progress.add_task(f"{progress_label} (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: future_to_idx = {} - for i in range(n): + for i, s in enumerate(sentences): local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} per_item_stats[i] = local_stats - fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), lang, wrapper, local_stats) + fut = ex.submit(_worker_sentence, s, lang, wrapper, local_stats) future_to_idx[fut] = i for fut in as_completed(future_to_idx): i = future_to_idx[fut] try: - out_lines[i] = fut.result() + out[i] = fut.result() except Exception as e: - out_lines[i] = f"Error: {e}" + out[i] = f"Error: {e}" progress.update(task, advance=1) for st in per_item_stats: stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) - return out_lines + return out + + +def format_text_lines( + sentences: List[str], + ipa_lines: List[str], + include_sentence: bool, + sep: str, +) -> List[str]: + if not include_sentence: + return ipa_lines + return [f"{s}{sep}{ipa}" for s, ipa in zip(sentences, ipa_lines)] def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: @@ -306,42 +256,99 @@ def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[st def main(): parser = argparse.ArgumentParser( - description="Generic IPA transcription using espeak-ng for any supported voice (default: shan). " - "Supports JSON list mode and plain-text line mode, with byte coverage stats." + description=( + "Generic IPA transcription using espeak-ng for any supported voice (default: shn). " + "Supports JSON list mode and plain-text line mode, with byte coverage stats.\n\n" + "NEW: --text_output and --text_no_sentence (JP-style) to optionally emit only IPA." + ) ) parser.add_argument("input_file", type=str, help="Path to the input file (JSON list or plain text).") # Language / voice - parser.add_argument("--lang", default="shan", - help="espeak-ng voice/language code (default: shan). Example: en, fr, de, es, ja, zh, etc.") + parser.add_argument( + "--lang", + default="shn", + help="espeak-ng voice/language code (default: shn). Example: en, fr, de, es, ja, zh, etc.", + ) # Mode selection - parser.add_argument("--mode", choices=["json", "text"], default="json", - help='Processing mode. 
"json" expects a JSON list; "text" treats file as plain text.') + parser.add_argument( + "--mode", + choices=["json", "text"], + default="json", + help='Processing mode. "json" expects a JSON list; "text" treats file as plain text.', + ) # JSON mode params - parser.add_argument("--input_json_key", type=str, - help="JSON key to read sentences from (required for --mode json).") - parser.add_argument("--output_json_key", type=str, default="ipa", - help='JSON key to store IPA (default: "ipa").') + parser.add_argument( + "--input_json_key", + type=str, + help="JSON key to read sentences from (required for --mode json).", + ) + parser.add_argument( + "--output_json_key", + type=str, + default="ipa", + help='JSON key to store IPA (default: "ipa").', + ) - # Text mode params - parser.add_argument("--output_file", type=str, default=None, - help="Output file path for text mode. Defaults to overwriting input.") + # Output path (used for text outputs; in JSON update mode we overwrite input_file) + parser.add_argument( + "--output_file", + type=str, + default=None, + help="Output file path for text outputs. In --mode text, defaults to overwriting input.", + ) # Wrapper option - parser.add_argument("--wrapper", default=False, action=argparse.BooleanOptionalAction, - help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).") + parser.add_argument( + "--wrapper", + default=False, + action=argparse.BooleanOptionalAction, + help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).", + ) # Multithreading options - parser.add_argument("--multithread", default=False, action=argparse.BooleanOptionalAction, - help="Enable multithreading while preserving output order.") - parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, - help="Number of worker threads when --multithread is enabled (default: CPU count).") + parser.add_argument( + "--multithread", + default=False, + action=argparse.BooleanOptionalAction, + help="Enable multithreading while preserving output order.", + ) + parser.add_argument( + "--workers", + type=int, + default=os.cpu_count() or 4, + help="Number of worker threads when --multithread is enabled (default: CPU count).", + ) # Stats output - parser.add_argument("--stats_json", type=str, default=None, - help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).", + ) + + # NEW: JP-style text emission controls + parser.add_argument( + "--text_output", + action="store_true", + help=( + "Emit text output lines instead of JSON update in --mode json. " + 'In --mode text, when set, emit "sentenceipa" lines (unless --text_no_sentence).' 
+ ), + ) + parser.add_argument( + "--text_no_sentence", + action="store_true", + help="In text output mode, emit only the IPA (omit the original sentence).", + ) + parser.add_argument( + "--text_sep", + default="\t", + help='Separator used between sentence and IPA in text output mode (default: tab).', + ) args = parser.parse_args() @@ -357,40 +364,79 @@ def main(): raise ValueError("--input_json_key is required when --mode json") with open(args.input_file, "r", encoding="utf-8") as f: - input_content = f.read() + data = json.load(f) + + if not isinstance(data, list): + raise ValueError("JSON data should be a list of objects.") - updated_json = transcribe_json_list( - input_content, - input_json_key=args.input_json_key, - output_json_key=args.output_json_key, + # collect sentences (only items that contain input_json_key) + indices: List[int] = [] + sentences: List[str] = [] + for i, item in enumerate(data): + if isinstance(item, dict) and args.input_json_key in item: + indices.append(i) + sentences.append(str(item[args.input_json_key])) + + ipa_lines = transcribe_sentences( + sentences, lang=args.lang, wrapper=args.wrapper, multithread=args.multithread, workers=args.workers, stats=stats, + progress_label="Processing JSON items", ) - if updated_json is not None: - # matches your existing style: overwrite JSON input file + + if args.text_output: + include_sentence = not args.text_no_sentence + out_lines = format_text_lines(sentences, ipa_lines, include_sentence, args.text_sep) + + target_path = args.output_file + if not target_path: + target_path = args.input_file + ".ipa.txt" + + with open(target_path, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + + print(f"✅ Successfully wrote text output to '{target_path}'") + else: + # default behavior: update JSON in-place (overwrite input_file) + for idx, ipa in zip(indices, ipa_lines): + data[idx][args.output_json_key] = ipa + with open(args.input_file, "w", encoding="utf-8") as f: - f.write(updated_json) + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"✅ Successfully updated JSON data in '{args.input_file}'") else: + # ---- TEXT MODE ---- with open(args.input_file, "r", encoding="utf-8") as f: - lines = f.readlines() + raw_lines = f.readlines() + + sentences = [ln.rstrip("\n") for ln in raw_lines] - out_lines = transcribe_text_lines( - lines, + ipa_lines = transcribe_sentences( + sentences, lang=args.lang, wrapper=args.wrapper, multithread=args.multithread, workers=args.workers, stats=stats, + progress_label="Processing text lines", ) + if args.text_output: + include_sentence = not args.text_no_sentence + out_lines = format_text_lines(sentences, ipa_lines, include_sentence, args.text_sep) + else: + # backward-compatible default: IPA-only + out_lines = ipa_lines + target_path = args.output_file if args.output_file else args.input_file with open(target_path, "w", encoding="utf-8") as f: f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + print(f"✅ Successfully wrote transcribed text to '{target_path}'") finalize_and_print_stats(stats, stats_json_path=args.stats_json) @@ -399,6 +445,8 @@ def main(): print(f"Error: Input file '{args.input_file}' not found.") except ValueError as ve: print(f"Error: {ve}") + except json.JSONDecodeError: + print(f"Error: Invalid JSON format in '{args.input_file}'.") if __name__ == "__main__": From 8e07a234d62312ded41c9c608380350ab7f969f2 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 10:15:37 -0800 Subject: [PATCH 04/10] Add Yue --- 
data/flores200-res/get_dataset.sh | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index d95b965614..b3096de376 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -1,22 +1,12 @@ #!/bin/bash -### Instructions: -# 1. Replace "INSERT_URL_WITH_FILES" with the actual URL to the Parquet files. -# 2. Modify the "include_keys" array to specify the keys you want to include in the output. -# 3. (Optionally) Modify the "value_prefixes" array to set prefixes for each value, use "" for empty prefixes -# 4. Set "--skip_empty" to true if you want to skip empty fields, or false if not needed. -# 5. Set "--no_output_text" to true if you plan to process the intermediate json files in a custom manner. -# 6. For CSV files with BOM headers, pass "--input_encoding utf-8-sig" to the helper script. -# 7. For CSV cells that contain multi-line text, use "--split_multiline_values" to emit one line per entry or -# "--newline_replacement" to substitute newline characters with custom text. - -# Run the Python script with the specified arguments lang_array=( "text_eng_Latn" "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" + "text_yue_Hant" "text_hin_Deva" "text_vie_Latn" "text_ind_Latn" @@ -242,5 +232,4 @@ for lang in "${lang_array[@]}"; do --include_keys "$lang" \ --value_prefix $'\n' \ --output_text_file "$lang".txt - done From 8346e3a60804637850e65354f3ba3347307f1d59 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 10:16:35 -0800 Subject: [PATCH 05/10] Add Yue to get dataset --- data/flores200-res/phoneticize.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 98fee2e617..2552a922ff 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -12,11 +12,14 @@ lang_array=( "text_swh_Latn:sw" "text_ell_Grek:el" "text_fra_Latn:fr" + "text_yue_Hant:yue" ) for lang in "${lang_array[@]}"; do text_file="${lang%%:*}" two_letter_code="${lang##*:}" echo "${text_file}; ${two_letter_code}" - python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + if [ ! 
-f "ipa_${text_file}.txt" ]; then + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + fi done From fe33152876404b22594cbc577600c70bfa839ef8 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 11:04:16 -0800 Subject: [PATCH 06/10] Add graphs for grouping bytes of languages --- data/flores200-res/graphs.sh | 9 + .../plot_langscript_sizes_grouped.py | 216 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 data/flores200-res/graphs.sh create mode 100644 data/flores200-res/plot_langscript_sizes_grouped.py diff --git a/data/flores200-res/graphs.sh b/data/flores200-res/graphs.sh new file mode 100644 index 0000000000..aabf067fda --- /dev/null +++ b/data/flores200-res/graphs.sh @@ -0,0 +1,9 @@ +#!/bin/bash +python3 plot_langscript_sizes_grouped.py --group-by script --color-by script --out by_script.png +python3 plot_langscript_sizes_grouped.py --group-by script --color-by region --out by_region_script.png +python3 plot_langscript_sizes_grouped.py --group-by region --color-by region --out by_region.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by family --out by_family.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by script --out by_family_script.png + + + diff --git a/data/flores200-res/plot_langscript_sizes_grouped.py b/data/flores200-res/plot_langscript_sizes_grouped.py new file mode 100644 index 0000000000..c1462018c1 --- /dev/null +++ b/data/flores200-res/plot_langscript_sizes_grouped.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +import json +import re +from collections import Counter, defaultdict +import matplotlib.pyplot as plt + +# text__ + + + +
+

{title}

+ Click a row (or use dropdown) to update per-file counts. +
+ +
+
+

Vocab + total frequency (directory aggregate)

+
+ + + + +
+
+
+ Showing top {len(token_rows)} tokens by frequency. +
+
+ +
+

Per-file counts

+
+
+ Bars show token count per file (same tokenization as training). +
+
+
+ + + + +""" + out_path.write_text(html, encoding="utf-8") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--vocab", default="trained_spm_model.vocab", + help="SentencePiece vocab file (default: trained_spm_model.vocab). Used to infer .model if --model not given.") + ap.add_argument("--model", default=None, + help="SentencePiece model file (.model). If omitted, inferred from --vocab by replacing .vocab -> .model.") + ap.add_argument("--dir", required=True, + help="Directory of text files to scan.") + ap.add_argument("--recursive", action="store_true", + help="Recurse into subdirectories (default: false).") + ap.add_argument("--suffixes", default=".txt", + help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + ap.add_argument("--top-k", type=int, default=1500, + help="Embed only top-K tokens by total frequency into the HTML for interactivity (default: 1500).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + ap.add_argument("--min-count", type=int, default=1, + help="Only consider tokens with total count >= this (default: 1).") + args = ap.parse_args() + + vocab_path = Path(args.vocab) + model_path = Path(args.model) if args.model else infer_model_path_from_vocab(vocab_path) + root = Path(args.dir) + out = Path(args.out) + + if not model_path.exists(): + raise SystemExit(f"SentencePiece model not found: {model_path} (pass --model or ensure it matches --vocab)") + if not root.exists() or not root.is_dir(): + raise SystemExit(f"Directory not found: {root}") + + suffixes = tuple(s.strip().lower() for s in args.suffixes.split(",") if s.strip()) + files = iter_text_files(root, recursive=args.recursive, suffixes=suffixes) + if not files: + raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") + + sp = spm.SentencePieceProcessor(model_file=str(model_path)) + vocab_size = sp.get_piece_size() + + print(f"[info] model: {model_path}") + print(f"[info] vocab size: {vocab_size}") + print(f"[info] scanning {len(files)} files under: {root}") + + total = Counter() + per_file: Dict[str, Counter] = {} + file_names: List[str] = [] + + for p in files: + rel = str(p.relative_to(root)) + file_names.append(rel) + c = count_tokens_in_file(sp, p) + per_file[rel] = c + total.update(c) + + # Build top tokens (by total frequency) + items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] + items.sort(key=lambda x: x[1], reverse=True) + + if not items: + raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") + + top_items = items[: max(1, args.top_k)] + + token_rows: List[Dict] = [] + # per_file_counts: token_id(str) -> file -> count (only for embedded tokens) + per_file_counts: Dict[str, Dict[str, int]] = {} + + for tid, cnt in top_items: + tok = human_token(sp.id_to_piece(int(tid))) + token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) + + # Build per-file map for embedded tokens + for tid, _ in top_items: + tid = int(tid) + k = str(tid) + per_file_counts[k] = {} + for fn in file_names: + v = per_file[fn].get(tid, 0) + if v: + per_file_counts[k][fn] = int(v) + + default_token_id = int(top_items[0][0]) + + title = f"SentencePiece token frequency dashboard ({root.name})" + build_html( + title=title, + token_rows=token_rows, + per_file_counts=per_file_counts, + file_order=file_names, + default_token_id=default_token_id, + out_path=out, + ) + + print(f"[done] wrote: {out}") + 
print(f"[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/tokenize_and_annotate_sizes.py b/data/flores200-res/tokenize_and_annotate_sizes.py index 152c9599a8..e39d834899 100644 --- a/data/flores200-res/tokenize_and_annotate_sizes.py +++ b/data/flores200-res/tokenize_and_annotate_sizes.py @@ -81,7 +81,7 @@ def main() -> None: ap.add_argument( "--in-json", - default="filtered_scripts.json", + default="filtered_files.json", help="Input JSON from filter_files_by_script.py", ) ap.add_argument( From 686f2b2b5affba3402d9ba1c8da0363ef895adae Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 13:05:48 -0800 Subject: [PATCH 09/10] Add .gitignore --- data/flores200-res/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 data/flores200-res/.gitignore diff --git a/data/flores200-res/.gitignore b/data/flores200-res/.gitignore new file mode 100644 index 0000000000..e33609d251 --- /dev/null +++ b/data/flores200-res/.gitignore @@ -0,0 +1 @@ +*.png From 375e5b945908c1ceacb8a0f50fc0006fdfae2662 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 19:10:11 -0800 Subject: [PATCH 10/10] Add updates to latest scripts --- data/flores200-res/ipa_scripts.sh | 11 +- .../flores200-res/spm_vocab_freq_dashboard.py | 338 ++++++++++++++---- 2 files changed, 276 insertions(+), 73 deletions(-) diff --git a/data/flores200-res/ipa_scripts.sh b/data/flores200-res/ipa_scripts.sh index 4b004aa1a6..ead45ced52 100644 --- a/data/flores200-res/ipa_scripts.sh +++ b/data/flores200-res/ipa_scripts.sh @@ -1,8 +1,8 @@ # include tokenized comparison (uses tokenized_sizes["tiktoken"] from filtered_scripts.json) -python3 plot_ipa_vs_text.py \ - --text-dir text --ipa-dir ipa \ - --filtered-json filtered_files.json \ - --tok-method tiktoken +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_files.json \ +# --tok-method tiktoken # # save everything to plots_out/ # python3 plot_ipa_vs_text.py \ @@ -18,3 +18,6 @@ python3 plot_ipa_vs_text.py \ # --tok-method tiktoken \ # --skip-missing-tok + +python3 plot_ipa_vs_text.py --text-dir text --ipa-dir ipa --save --outdir plots_out --csv + diff --git a/data/flores200-res/spm_vocab_freq_dashboard.py b/data/flores200-res/spm_vocab_freq_dashboard.py index f07da105e9..5a0bb941d5 100644 --- a/data/flores200-res/spm_vocab_freq_dashboard.py +++ b/data/flores200-res/spm_vocab_freq_dashboard.py @@ -2,18 +2,22 @@ """ spm_vocab_freq_dashboard.py -Build a single self-contained HTML dashboard (Plotly + vanilla JS) that shows: +Single-script, self-contained HTML dashboard (Plotly + vanilla JS) that shows: LEFT: - - SentencePiece vocab tokens + total frequency across *all* .txt files in a directory - - searchable dropdown to pick a token (and optional click-to-select from table) + - SentencePiece vocab tokens + total frequency across *all* text files in a directory + - searchable dropdown to pick a token + - click-to-select from the token table -RIGHT: - - per-file counts for the currently-selected token (bar chart) - - updates live in the same HTML (no server) +RIGHT (top): + - per-file counts for the selected token (bar chart) + +RIGHT (bottom): + - square similarity heatmap clustering text files by similarity across high-frequency vocab + (cosine similarity over TF-IDF on top vocab tokens) Why we require a .model: - - SentencePiece tokenization is not plain substring matching; to get true token frequencies, + - SentencePiece tokenization is 
not substring matching; to get true token frequencies, we MUST encode text using the SentencePiece model. Defaults: @@ -25,8 +29,8 @@ vocab_freq_dashboard.html (or --out) Example: - python3 spm_vocab_freq_dashboard.py --dir ./text --vocab trained_spm_model.vocab - python3 spm_vocab_freq_dashboard.py --dir ./text --model trained_spm_model.model --top-k 2000 + python3 spm_vocab_freq_dashboard.py --dir ./text --vocab trained_spm_model.vocab --heatmap + python3 spm_vocab_freq_dashboard.py --dir ./text --heatmap-top-k 500 --recursive """ from __future__ import annotations @@ -35,24 +39,26 @@ import json from collections import Counter from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional import sentencepiece as spm +# For heatmap similarity: NumPy required (SciPy not needed) +try: + import numpy as np +except Exception as e: + np = None + _NUMPY_IMPORT_ERROR = e + def infer_model_path_from_vocab(vocab_path: Path) -> Path: - # trained_spm_model.vocab -> trained_spm_model.model if vocab_path.suffix.lower() == ".vocab": return vocab_path.with_suffix(".model") - # fallback: append .model return Path(str(vocab_path) + ".model") def iter_text_files(root: Path, recursive: bool, suffixes: Tuple[str, ...]) -> List[Path]: - if recursive: - it = root.rglob("*") - else: - it = root.glob("*") + it = root.rglob("*") if recursive else root.glob("*") files = [] for p in it: if p.is_file() and p.suffix.lower() in suffixes: @@ -74,34 +80,179 @@ def count_tokens_in_file(sp: spm.SentencePieceProcessor, path: Path) -> Counter: def human_token(tok: str) -> str: - # Make SentencePiece boundary visible and avoid crazy HTML rendering. - # Keep it readable: ▁ (U+2581) is the "word boundary" marker in SPM. return tok.replace("\t", " ").replace("\n", "\\n") +def _build_tfidf_matrix( + file_names: List[str], + per_file: Dict[str, Counter], + token_ids: List[int], +) -> "np.ndarray": + """ + docs x tokens TF-IDF, L2-normalized per doc + """ + assert np is not None + n_docs = len(file_names) + n_tok = len(token_ids) + if n_docs == 0 or n_tok == 0: + return np.zeros((n_docs, n_tok), dtype=np.float32) + + tok_to_col = {tid: j for j, tid in enumerate(token_ids)} + + # document frequency + df = np.zeros((n_tok,), dtype=np.int32) + for fn in file_names: + c = per_file[fn] + for tid in c.keys(): + j = tok_to_col.get(tid) + if j is not None: + df[j] += 1 + + # smooth idf + idf = np.log((n_docs + 1.0) / (df.astype(np.float32) + 1.0)) + 1.0 + + X = np.zeros((n_docs, n_tok), dtype=np.float32) + for i, fn in enumerate(file_names): + c = per_file[fn] + row = X[i] + for tid, cnt in c.items(): + j = tok_to_col.get(tid) + if j is not None: + row[j] = float(cnt) + + row *= idf + + # L2 normalize + norm = float(np.linalg.norm(row)) + if norm > 0: + row /= norm + + return X + + +def _cosine_similarity_matrix(X: "np.ndarray") -> "np.ndarray": + """ + X assumed rows L2-normalized; cosine similarity = X @ X.T + """ + assert np is not None + if X.size == 0: + return np.zeros((X.shape[0], X.shape[0]), dtype=np.float32) + S = X @ X.T + # numerical guard + S = np.clip(S, -1.0, 1.0).astype(np.float32) + # make diagonal exactly 1 + n = S.shape[0] + for i in range(n): + S[i, i] = 1.0 + return S + + +def _order_by_simple_clustering(S: "np.ndarray") -> List[int]: + """ + Optional: reorder files so similar ones are near each other, without SciPy. 
+ Greedy "nearest neighbor chain" heuristic: + - start from most "central" (max average similarity) + - repeatedly append most similar unused item to the last + """ + assert np is not None + n = S.shape[0] + if n <= 2: + return list(range(n)) + + avg = S.mean(axis=1) + start = int(np.argmax(avg)) + order = [start] + used = set(order) + + while len(order) < n: + last = order[-1] + # pick unused with max similarity to last + best_j = None + best_val = -1e9 + for j in range(n): + if j in used: + continue + v = float(S[last, j]) + if v > best_val: + best_val = v + best_j = j + order.append(int(best_j)) + used.add(int(best_j)) + + return order + + +def _build_heatmap_payload( + file_names: List[str], + per_file: Dict[str, Counter], + token_ids_for_heatmap: List[int], + reorder: bool, +) -> Dict: + """ + Returns plotly-ready payload for similarity heatmap. + """ + if np is None: + raise RuntimeError( + "NumPy is required for heatmap mode.\n" + f"Import error: {_NUMPY_IMPORT_ERROR!r}\n" + "Install: python3 -m pip install numpy" + ) + + if len(file_names) < 2: + return {"ok": False, "reason": "Need at least 2 files to build a similarity heatmap."} + + X = _build_tfidf_matrix(file_names, per_file, token_ids_for_heatmap) + S = _cosine_similarity_matrix(X) + + idx = list(range(len(file_names))) + if reorder: + idx = _order_by_simple_clustering(S) + + labels = [file_names[i] for i in idx] + S2 = S[np.ix_(idx, idx)] + + # convert to nested lists for JSON + z = S2.tolist() + + traces = [{ + "type": "heatmap", + "z": z, + "x": labels, + "y": labels, + "zmin": 0.0, + "zmax": 1.0, + "hovertemplate": "x=%{x}
<br>y=%{y}<br>
cosine=%{z:.3f}", + # no explicit colorscale specified (Plotly default) to match your earlier “no custom colors” vibe + }] + + layout = { + "margin": {"l": 120, "r": 20, "t": 40, "b": 120}, + "title": "File similarity heatmap (TF-IDF on high-freq SPM vocab, cosine similarity)", + "xaxis": {"tickangle": 35, "automargin": True}, + "yaxis": {"automargin": True}, + } + + return {"ok": True, "traces": traces, "layout": layout} + + def build_html( title: str, token_rows: List[Dict], per_file_counts: Dict[str, Dict[str, int]], file_order: List[str], default_token_id: int, + heatmap_payload: Optional[Dict], out_path: Path, ) -> None: - """ - token_rows: list of dicts for top-k tokens: {id, token, count} - per_file_counts: { token_id(str) -> { file_name -> count } } only for tokens we embed - file_order: stable order of files for bar chart - """ payload = { "title": title, "tokens": token_rows, "per_file": per_file_counts, "files": file_order, "default_token_id": default_token_id, + "heatmap": heatmap_payload, } - # NOTE: This uses Plotly CDN for a "dynamic" interactive page without extra deps. - # If you need fully offline HTML (no CDN), we can embed plotly.min.js, but the file is huge. html = f""" @@ -170,10 +321,21 @@ def build_html( flex: 1; min-height: 200px; }} + .rightCharts {{ + display: flex; + flex-direction: column; + gap: 10px; + flex: 1; + overflow: hidden; + }} #barDiv {{ flex: 1; min-height: 200px; }} + #heatDiv {{ + flex: 1; + min-height: 260px; + }} .note {{ font-size: 12px; color: #555; @@ -187,7 +349,7 @@ def build_html(

{title}

- Click a row (or use dropdown) to update per-file counts. + Pick a token to update per-file counts; heatmap shows file similarity via high-frequency vocab.
@@ -206,10 +368,13 @@ def build_html(
-

Per-file counts

-
+

Per-file counts + similarity heatmap

+
+
+
+
- Bars show token count per file (same tokenization as training). + Heatmap uses TF-IDF over high-frequency SentencePiece tokens and cosine similarity.
@@ -218,9 +383,7 @@ def build_html( const DATA = {json.dumps(payload, ensure_ascii=False)}; function fmtTokenRow(t) {{ - // Make whitespace visible-ish in dropdown let s = t.token; - // show the word-boundary marker as "▁" (already is), but keep readable if (s.length > 60) s = s.slice(0, 57) + "…"; return `${{t.id}}: ${{s}} (${{t.count}})`; }} @@ -243,7 +406,6 @@ def build_html( }} function renderTable(tokens) {{ - // Plotly table const ids = tokens.map(t => t.id); const toks = tokens.map(t => t.token); const counts = tokens.map(t => t.count); @@ -261,13 +423,10 @@ def build_html( }} }}]; - const layout = {{ + Plotly.newPlot("tableDiv", tableData, {{ margin: {{l: 10, r: 10, t: 10, b: 10}}, - }}; - - Plotly.newPlot("tableDiv", tableData, layout, {{displayModeBar: false}}); + }}, {{displayModeBar: false}}); - // Click-to-select token: for tables, plotly_click gives pointNumber (row index) const tableDiv = document.getElementById("tableDiv"); tableDiv.on("plotly_click", (ev) => {{ try {{ @@ -288,32 +447,52 @@ def build_html( const xs = DATA.files.slice(); const ys = xs.map(fn => (per[fn] || 0)); - const trace = {{ + Plotly.newPlot("barDiv", [{{ type: "bar", x: xs, y: ys - }}; - - const layout = {{ + }}], {{ margin: {{l: 50, r: 10, t: 30, b: 120}}, - xaxis: {{ - tickangle: 35, - automargin: true - }}, - yaxis: {{ - title: "Count" - }}, + xaxis: {{ tickangle: 35, automargin: true }}, + yaxis: {{ title: "Count" }}, title: `Token: ${{name}} (id=${{tokenId}})` - }}; + }}, {{displayModeBar: true}}); +}} + +function renderHeatmap() {{ + const h = DATA.heatmap; + const div = document.getElementById("heatDiv"); + + if (!h) {{ + Plotly.newPlot(div, [], {{ + margin: {{l: 20, r: 10, t: 30, b: 30}}, + title: "Similarity heatmap: not computed", + annotations: [{{ + text: "No heatmap payload present.", + xref: "paper", yref: "paper", x: 0.5, y: 0.5, showarrow: false + }}] + }}, {{displayModeBar: false}}); + return; + }} + + if (!h.ok) {{ + Plotly.newPlot(div, [], {{ + margin: {{l: 20, r: 10, t: 30, b: 30}}, + title: "Similarity heatmap: unavailable", + annotations: [{{ + text: h.reason || "Unavailable", + xref: "paper", yref: "paper", x: 0.5, y: 0.5, showarrow: false + }}] + }}, {{displayModeBar: false}}); + return; + }} - Plotly.newPlot("barDiv", [trace], layout, {{displayModeBar: true}}); - document.getElementById("rightTitle").textContent = "Per-file counts"; + Plotly.newPlot(div, h.traces, h.layout, {{displayModeBar: true}}); }} function selectToken(tokenId, updateSelect) {{ if (updateSelect) {{ - const sel = document.getElementById("tokenSelect"); - sel.value = String(tokenId); + document.getElementById("tokenSelect").value = String(tokenId); }} renderBar(tokenId); }} @@ -321,11 +500,12 @@ def build_html( function init() {{ buildSelectOptions(DATA.tokens); - // Default selection const sel = document.getElementById("tokenSelect"); sel.value = String(DATA.default_token_id); + renderTable(DATA.tokens); renderBar(DATA.default_token_id); + renderHeatmap(); sel.addEventListener("change", (e) => {{ selectToken(e.target.value, false); @@ -336,19 +516,14 @@ def build_html( const q = e.target.value || ""; const filtered = filterTokens(DATA.tokens, q); - // update dropdown to filtered list, but keep current selection if still present const cur = document.getElementById("tokenSelect").value; buildSelectOptions(filtered); - // if current selection is in filtered list, keep it; else select first const hasCur = filtered.some(t => String(t.id) === String(cur)); const newId = hasCur ? cur : (filtered.length ? 
String(filtered[0].id) : String(DATA.default_token_id)); document.getElementById("tokenSelect").value = newId; - // rerender table with filtered tokens renderTable(filtered); - - // update bar based on dropdown selection selectToken(newId, false); }}); }} @@ -373,12 +548,22 @@ def main() -> None: help="Recurse into subdirectories (default: false).") ap.add_argument("--suffixes", default=".txt", help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + ap.add_argument("--top-k", type=int, default=1500, - help="Embed only top-K tokens by total frequency into the HTML for interactivity (default: 1500).") - ap.add_argument("--out", default="vocab_freq_dashboard.html", - help="Output HTML path (default: vocab_freq_dashboard.html)") + help="Embed only top-K tokens by total frequency into the HTML (default: 1500).") ap.add_argument("--min-count", type=int, default=1, help="Only consider tokens with total count >= this (default: 1).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + + # NEW: heatmap controls (no dendro mode; just heatmap) + ap.add_argument("--heatmap", action="store_true", + help="Compute and embed a file similarity heatmap (requires numpy).") + ap.add_argument("--heatmap-top-k", type=int, default=300, + help="Use top-K frequent tokens (from the directory) as features for TF-IDF similarity (default: 300).") + ap.add_argument("--heatmap-reorder", action="store_true", + help="Reorder files to group similar ones (simple greedy heuristic, no SciPy).") + args = ap.parse_args() vocab_path = Path(args.vocab) @@ -397,10 +582,9 @@ def main() -> None: raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") sp = spm.SentencePieceProcessor(model_file=str(model_path)) - vocab_size = sp.get_piece_size() print(f"[info] model: {model_path}") - print(f"[info] vocab size: {vocab_size}") + print(f"[info] vocab size: {sp.get_piece_size()}") print(f"[info] scanning {len(files)} files under: {root}") total = Counter() @@ -414,24 +598,21 @@ def main() -> None: per_file[rel] = c total.update(c) - # Build top tokens (by total frequency) + # Token UI: top tokens by directory frequency items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] items.sort(key=lambda x: x[1], reverse=True) - if not items: raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") top_items = items[: max(1, args.top_k)] token_rows: List[Dict] = [] - # per_file_counts: token_id(str) -> file -> count (only for embedded tokens) per_file_counts: Dict[str, Dict[str, int]] = {} for tid, cnt in top_items: tok = human_token(sp.id_to_piece(int(tid))) token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) - # Build per-file map for embedded tokens for tid, _ in top_items: tid = int(tid) k = str(tid) @@ -443,6 +624,22 @@ def main() -> None: default_token_id = int(top_items[0][0]) + heatmap_payload: Optional[Dict] = None + if args.heatmap: + # Features for similarity + feat_tok_ids = [int(tid) for tid, _ in items[: max(2, args.heatmap_top_k)]] + print(f"[info] heatmap features: top {len(feat_tok_ids)} tokens (TF-IDF)") + try: + heatmap_payload = _build_heatmap_payload( + file_names=file_names, + per_file=per_file, + token_ids_for_heatmap=feat_tok_ids, + reorder=args.heatmap_reorder, + ) + except Exception as e: + heatmap_payload = {"ok": False, "reason": f"Failed to build heatmap: {e!r}"} + print(f"[warn] heatmap failed: {e!r}") + title = 
f"SentencePiece token frequency dashboard ({root.name})" build_html( title=title, @@ -450,11 +647,14 @@ def main() -> None: per_file_counts=per_file_counts, file_order=file_names, default_token_id=default_token_id, + heatmap_payload=heatmap_payload, out_path=out, ) print(f"[done] wrote: {out}") - print(f"[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + print("[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + if args.heatmap and (np is None): + print("[note] Install heatmap deps: python3 -m pip install numpy") if __name__ == "__main__":