diff --git a/data/flores200-res/.gitignore b/data/flores200-res/.gitignore new file mode 100644 index 0000000000..e33609d251 --- /dev/null +++ b/data/flores200-res/.gitignore @@ -0,0 +1 @@ +*.png diff --git a/data/flores200-res/README.md b/data/flores200-res/README.md new file mode 100644 index 0000000000..aab97633c7 --- /dev/null +++ b/data/flores200-res/README.md @@ -0,0 +1,20 @@ +# Scripts compatible with Flores-200 Restructured + +This folder contains scripts compatible with the Flores-200 project, originally +from: +https://github.com/facebookresearch/flores/blob/main/README.md + +These scripts, however, target the restructured format proposed by muhammadravi251001: +https://huggingface.co/datasets/muhammadravi251001/restructured-flores200 + +## License of dataset + +The Flores-200 dataset is licensed under CC-BY-SA 4.0. + +## Language Codes + +Language codes for Flores-200: +https://github.com/facebookresearch/flores/blob/main/flores200/README.md + +Language codes for espeak (the basis of many of the phoneticizers): +https://espeak.sourceforge.net/languages.html diff --git a/data/flores200-res/eng_stats.json b/data/flores200-res/eng_stats.json new file mode 100644 index 0000000000..abcded9fae --- /dev/null +++ b/data/flores200-res/eng_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208634, + "not_transcribed_bytes": 7905, + "total_bytes": 216539, + "pct_transcribed": 96.34938740827288, + "pct_not_transcribed": 3.6506125917271253 +} \ No newline at end of file diff --git a/data/flores200-res/filter_files_by_script.py b/data/flores200-res/filter_files_by_script.py new file mode 100644 index 0000000000..3923918ddf --- /dev/null +++ b/data/flores200-res/filter_files_by_script.py @@ -0,0 +1,89 @@ + +#!/usr/bin/env python3 +""" +filter_files_by_script.py + +Read files.json and emit a simplified JSON with only fields +relevant to script/language analysis. + +Keeps: + - language (ISO 639-3) + - script (ISO 15924) + - lang_script (language_script) + - size_kb (float) + - filename (optional but useful) +""" + +import json +import re +import argparse + +FNAME_RE = re.compile(r"^text_([a-z]{3})_([A-Za-z]{4})\.txt$") + + +def parse_size_to_kb(size_str: str) -> float: + """ + Convert ls -h style sizes to KB.
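+
+    For example (per the multiplier table below):
+        >>> parse_size_to_kb("512")
+        0.5
+        >>> parse_size_to_kb("1.5K")
+        1.5
+        >>> parse_size_to_kb("2M")
+        2048.0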
+ """ + m = re.match(r"^\s*([0-9]*\.?[0-9]+)\s*([KMGTP]?)(B?)\s*$", size_str, re.IGNORECASE) + if not m: + raise ValueError(f"Unrecognized size string: {size_str!r}") + + val = float(m.group(1)) + unit = m.group(2).upper() + + mult = { + "": 1.0 / 1024.0, # bytes -> KB + "K": 1.0, + "M": 1024.0, + "G": 1024.0**2, + "T": 1024.0**3, + "P": 1024.0**4, + }[unit] + + return val * mult + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--json", default="files.json", help="Input files.json") + ap.add_argument("--out", default="filtered_scripts.json", help="Output JSON") + ap.add_argument("--drop-filename", action="store_true", + help="Do not include original filename in output") + args = ap.parse_args() + + with open(args.json, "r", encoding="utf-8") as f: + rows = json.load(f) + + filtered = [] + + for r in rows: + name = r.get("name", "") + m = FNAME_RE.match(name) + if not m: + continue + + lang, script = m.groups() + size_kb = parse_size_to_kb(str(r["size"])) + + entry = { + "language": lang, + "script": script, + "lang_script": f"{lang}_{script}", + "size_kb": size_kb, + } + + if not args.drop_filename: + entry["filename"] = name + + filtered.append(entry) + + with open(args.out, "w", encoding="utf-8") as f: + json.dump(filtered, f, indent=2, ensure_ascii=False) + + print(f"Wrote {len(filtered)} entries to {args.out}") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/filtered_files.json b/data/flores200-res/filtered_files.json new file mode 100644 index 0000000000..c79266cebf --- /dev/null +++ b/data/flores200-res/filtered_files.json @@ -0,0 +1,1512 @@ +[ + { + "language": "yue", + "script": "Hant", + "lang_script": "yue_Hant", + "size_kb": 221.0, + "filename": "text_yue_Hant.txt", + "tokenized_sizes": { + "tiktoken": 318.50390625 + } + }, + { + "language": "zho", + "script": "Hans", + "lang_script": "zho_Hans", + "size_kb": 235.0, + "filename": "text_zho_Hans.txt", + "tokenized_sizes": { + "tiktoken": 331.03125 + } + }, + { + "language": "ace", + "script": "Arab", + "lang_script": "ace_Arab", + "size_kb": 383.0, + "filename": "text_ace_Arab.txt", + "tokenized_sizes": { + "tiktoken": 492.009765625 + } + }, + { + "language": "ace", + "script": "Latn", + "lang_script": "ace_Latn", + "size_kb": 277.0, + "filename": "text_ace_Latn.txt", + "tokenized_sizes": { + "tiktoken": 226.68359375 + } + }, + { + "language": "acm", + "script": "Arab", + "lang_script": "acm_Arab", + "size_kb": 396.0, + "filename": "text_acm_Arab.txt", + "tokenized_sizes": { + "tiktoken": 439.6796875 + } + }, + { + "language": "acq", + "script": "Arab", + "lang_script": "acq_Arab", + "size_kb": 400.0, + "filename": "text_acq_Arab.txt", + "tokenized_sizes": { + "tiktoken": 447.302734375 + } + }, + { + "language": "aeb", + "script": "Arab", + "lang_script": "aeb_Arab", + "size_kb": 390.0, + "filename": "text_aeb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 433.5234375 + } + }, + { + "language": "afr", + "script": "Latn", + "lang_script": "afr_Latn", + "size_kb": 272.0, + "filename": "text_afr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.251953125 + } + }, + { + "language": "ajp", + "script": "Arab", + "lang_script": "ajp_Arab", + "size_kb": 377.0, + "filename": "text_ajp_Arab.txt", + "tokenized_sizes": { + "tiktoken": 415.3125 + } + }, + { + "language": "aka", + "script": "Latn", + "lang_script": "aka_Latn", + "size_kb": 279.0, + "filename": "text_aka_Latn.txt", + "tokenized_sizes": { + "tiktoken": 291.162109375 + } + }, + { + "language": "als", + "script": "Latn", + 
"lang_script": "als_Latn", + "size_kb": 306.0, + "filename": "text_als_Latn.txt", + "tokenized_sizes": { + "tiktoken": 276.4609375 + } + }, + { + "language": "amh", + "script": "Ethi", + "lang_script": "amh_Ethi", + "size_kb": 436.0, + "filename": "text_amh_Ethi.txt", + "tokenized_sizes": { + "tiktoken": 803.419921875 + } + }, + { + "language": "apc", + "script": "Arab", + "lang_script": "apc_Arab", + "size_kb": 376.0, + "filename": "text_apc_Arab.txt", + "tokenized_sizes": { + "tiktoken": 416.90625 + } + }, + { + "language": "arb", + "script": "Arab", + "lang_script": "arb_Arab", + "size_kb": 405.0, + "filename": "text_arb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 453.7265625 + } + }, + { + "language": "arb", + "script": "Latn", + "lang_script": "arb_Latn", + "size_kb": 296.0, + "filename": "text_arb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 262.181640625 + } + }, + { + "language": "ars", + "script": "Arab", + "lang_script": "ars_Arab", + "size_kb": 405.0, + "filename": "text_ars_Arab.txt", + "tokenized_sizes": { + "tiktoken": 454.37890625 + } + }, + { + "language": "ary", + "script": "Arab", + "lang_script": "ary_Arab", + "size_kb": 395.0, + "filename": "text_ary_Arab.txt", + "tokenized_sizes": { + "tiktoken": 434.587890625 + } + }, + { + "language": "arz", + "script": "Arab", + "lang_script": "arz_Arab", + "size_kb": 396.0, + "filename": "text_arz_Arab.txt", + "tokenized_sizes": { + "tiktoken": 437.30078125 + } + }, + { + "language": "asm", + "script": "Beng", + "lang_script": "asm_Beng", + "size_kb": 644.0, + "filename": "text_asm_Beng.txt", + "tokenized_sizes": { + "tiktoken": 1005.07421875 + } + }, + { + "language": "ast", + "script": "Latn", + "lang_script": "ast_Latn", + "size_kb": 270.0, + "filename": "text_ast_Latn.txt", + "tokenized_sizes": { + "tiktoken": 197.443359375 + } + }, + { + "language": "awa", + "script": "Deva", + "lang_script": "awa_Deva", + "size_kb": 634.0, + "filename": "text_awa_Deva.txt", + "tokenized_sizes": { + "tiktoken": 742.212890625 + } + }, + { + "language": "ayr", + "script": "Latn", + "lang_script": "ayr_Latn", + "size_kb": 272.0, + "filename": "text_ayr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 243.201171875 + } + }, + { + "language": "azb", + "script": "Arab", + "lang_script": "azb_Arab", + "size_kb": 412.0, + "filename": "text_azb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 531.052734375 + } + }, + { + "language": "azj", + "script": "Latn", + "lang_script": "azj_Latn", + "size_kb": 320.0, + "filename": "text_azj_Latn.txt", + "tokenized_sizes": { + "tiktoken": 360.34765625 + } + }, + { + "language": "bak", + "script": "Cyrl", + "lang_script": "bak_Cyrl", + "size_kb": 470.0, + "filename": "text_bak_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 620.775390625 + } + }, + { + "language": "bam", + "script": "Latn", + "lang_script": "bam_Latn", + "size_kb": 265.0, + "filename": "text_bam_Latn.txt", + "tokenized_sizes": { + "tiktoken": 277.44921875 + } + }, + { + "language": "ban", + "script": "Latn", + "lang_script": "ban_Latn", + "size_kb": 282.0, + "filename": "text_ban_Latn.txt", + "tokenized_sizes": { + "tiktoken": 207.21875 + } + }, + { + "language": "bel", + "script": "Cyrl", + "lang_script": "bel_Cyrl", + "size_kb": 523.0, + "filename": "text_bel_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 677.169921875 + } + }, + { + "language": "bem", + "script": "Latn", + "lang_script": "bem_Latn", + "size_kb": 312.0, + "filename": "text_bem_Latn.txt", + "tokenized_sizes": { + "tiktoken": 257.189453125 + } + }, + { + "language": "ben", + 
"script": "Beng", + "lang_script": "ben_Beng", + "size_kb": 661.0, + "filename": "text_ben_Beng.txt", + "tokenized_sizes": { + "tiktoken": 991.65625 + } + }, + { + "language": "bho", + "script": "Deva", + "lang_script": "bho_Deva", + "size_kb": 626.0, + "filename": "text_bho_Deva.txt", + "tokenized_sizes": { + "tiktoken": 740.771484375 + } + }, + { + "language": "bjn", + "script": "Arab", + "lang_script": "bjn_Arab", + "size_kb": 428.0, + "filename": "text_bjn_Arab.txt", + "tokenized_sizes": { + "tiktoken": 517.115234375 + } + }, + { + "language": "bjn", + "script": "Latn", + "lang_script": "bjn_Latn", + "size_kb": 267.0, + "filename": "text_bjn_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.12109375 + } + }, + { + "language": "bod", + "script": "Tibt", + "lang_script": "bod_Tibt", + "size_kb": 840.0, + "filename": "text_bod_Tibt.txt", + "tokenized_sizes": { + "tiktoken": 1532.25 + } + }, + { + "language": "bos", + "script": "Latn", + "lang_script": "bos_Latn", + "size_kb": 262.0, + "filename": "text_bos_Latn.txt", + "tokenized_sizes": { + "tiktoken": 229.4296875 + } + }, + { + "language": "bug", + "script": "Latn", + "lang_script": "bug_Latn", + "size_kb": 278.0, + "filename": "text_bug_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.97265625 + } + }, + { + "language": "bul", + "script": "Cyrl", + "lang_script": "bul_Cyrl", + "size_kb": 479.0, + "filename": "text_bul_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 569.134765625 + } + }, + { + "language": "cat", + "script": "Latn", + "lang_script": "cat_Latn", + "size_kb": 285.0, + "filename": "text_cat_Latn.txt", + "tokenized_sizes": { + "tiktoken": 201.0390625 + } + }, + { + "language": "ceb", + "script": "Latn", + "lang_script": "ceb_Latn", + "size_kb": 304.0, + "filename": "text_ceb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 234.4296875 + } + }, + { + "language": "ces", + "script": "Latn", + "lang_script": "ces_Latn", + "size_kb": 274.0, + "filename": "text_ces_Latn.txt", + "tokenized_sizes": { + "tiktoken": 273.87109375 + } + }, + { + "language": "cjk", + "script": "Latn", + "lang_script": "cjk_Latn", + "size_kb": 272.0, + "filename": "text_cjk_Latn.txt", + "tokenized_sizes": { + "tiktoken": 226.83984375 + } + }, + { + "language": "ckb", + "script": "Arab", + "lang_script": "ckb_Arab", + "size_kb": 287.0, + "filename": "text_ckb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 423.845703125 + } + }, + { + "language": "crh", + "script": "Latn", + "lang_script": "crh_Latn", + "size_kb": 285.0, + "filename": "text_crh_Latn.txt", + "tokenized_sizes": { + "tiktoken": 258.728515625 + } + }, + { + "language": "cym", + "script": "Latn", + "lang_script": "cym_Latn", + "size_kb": 272.0, + "filename": "text_cym_Latn.txt", + "tokenized_sizes": { + "tiktoken": 246.384765625 + } + }, + { + "language": "dan", + "script": "Latn", + "lang_script": "dan_Latn", + "size_kb": 266.0, + "filename": "text_dan_Latn.txt", + "tokenized_sizes": { + "tiktoken": 198.712890625 + } + }, + { + "language": "deu", + "script": "Latn", + "lang_script": "deu_Latn", + "size_kb": 301.0, + "filename": "text_deu_Latn.txt", + "tokenized_sizes": { + "tiktoken": 224.591796875 + } + }, + { + "language": "dik", + "script": "Latn", + "lang_script": "dik_Latn", + "size_kb": 245.0, + "filename": "text_dik_Latn.txt", + "tokenized_sizes": { + "tiktoken": 258.310546875 + } + }, + { + "language": "dyu", + "script": "Latn", + "lang_script": "dyu_Latn", + "size_kb": 272.0, + "filename": "text_dyu_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.681640625 + } + }, + { + 
"language": "dzo", + "script": "Tibt", + "lang_script": "dzo_Tibt", + "size_kb": 921.0, + "filename": "text_dzo_Tibt.txt", + "tokenized_sizes": { + "tiktoken": 1679.669921875 + } + }, + { + "language": "ell", + "script": "Grek", + "lang_script": "ell_Grek", + "size_kb": 550.0, + "filename": "text_ell_Grek.txt", + "tokenized_sizes": { + "tiktoken": 674.869140625 + } + }, + { + "language": "eng", + "script": "Latn", + "lang_script": "eng_Latn", + "size_kb": 254.0, + "filename": "text_eng_Latn.txt", + "tokenized_sizes": { + "tiktoken": 107.015625 + } + }, + { + "language": "epo", + "script": "Latn", + "lang_script": "epo_Latn", + "size_kb": 258.0, + "filename": "text_epo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 212.609375 + } + }, + { + "language": "est", + "script": "Latn", + "lang_script": "est_Latn", + "size_kb": 257.0, + "filename": "text_est_Latn.txt", + "tokenized_sizes": { + "tiktoken": 221.576171875 + } + }, + { + "language": "eus", + "script": "Latn", + "lang_script": "eus_Latn", + "size_kb": 270.0, + "filename": "text_eus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 220.109375 + } + }, + { + "language": "ewe", + "script": "Latn", + "lang_script": "ewe_Latn", + "size_kb": 271.0, + "filename": "text_ewe_Latn.txt", + "tokenized_sizes": { + "tiktoken": 301.962890625 + } + }, + { + "language": "fao", + "script": "Latn", + "lang_script": "fao_Latn", + "size_kb": 278.0, + "filename": "text_fao_Latn.txt", + "tokenized_sizes": { + "tiktoken": 249.02734375 + } + }, + { + "language": "fij", + "script": "Latn", + "lang_script": "fij_Latn", + "size_kb": 297.0, + "filename": "text_fij_Latn.txt", + "tokenized_sizes": { + "tiktoken": 240.689453125 + } + }, + { + "language": "fin", + "script": "Latn", + "lang_script": "fin_Latn", + "size_kb": 281.0, + "filename": "text_fin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 238.443359375 + } + }, + { + "language": "fon", + "script": "Latn", + "lang_script": "fon_Latn", + "size_kb": 320.0, + "filename": "text_fon_Latn.txt", + "tokenized_sizes": { + "tiktoken": 422.541015625 + } + }, + { + "language": "fra", + "script": "Latn", + "lang_script": "fra_Latn", + "size_kb": 313.0, + "filename": "text_fra_Latn.txt", + "tokenized_sizes": { + "tiktoken": 209.17578125 + } + }, + { + "language": "fur", + "script": "Latn", + "lang_script": "fur_Latn", + "size_kb": 287.0, + "filename": "text_fur_Latn.txt", + "tokenized_sizes": { + "tiktoken": 216.59375 + } + }, + { + "language": "fuv", + "script": "Latn", + "lang_script": "fuv_Latn", + "size_kb": 243.0, + "filename": "text_fuv_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.873046875 + } + }, + { + "language": "gaz", + "script": "Latn", + "lang_script": "gaz_Latn", + "size_kb": 305.0, + "filename": "text_gaz_Latn.txt", + "tokenized_sizes": { + "tiktoken": 264.150390625 + } + }, + { + "language": "gla", + "script": "Latn", + "lang_script": "gla_Latn", + "size_kb": 325.0, + "filename": "text_gla_Latn.txt", + "tokenized_sizes": { + "tiktoken": 281.35546875 + } + }, + { + "language": "gle", + "script": "Latn", + "lang_script": "gle_Latn", + "size_kb": 312.0, + "filename": "text_gle_Latn.txt", + "tokenized_sizes": { + "tiktoken": 266.9453125 + } + }, + { + "language": "glg", + "script": "Latn", + "lang_script": "glg_Latn", + "size_kb": 287.0, + "filename": "text_glg_Latn.txt", + "tokenized_sizes": { + "tiktoken": 200.67578125 + } + }, + { + "language": "grn", + "script": "Latn", + "lang_script": "grn_Latn", + "size_kb": 275.0, + "filename": "text_grn_Latn.txt", + "tokenized_sizes": { + "tiktoken": 
256.6171875 + } + }, + { + "language": "guj", + "script": "Gujr", + "lang_script": "guj_Gujr", + "size_kb": 633.0, + "filename": "text_guj_Gujr.txt", + "tokenized_sizes": { + "tiktoken": 1259.890625 + } + }, + { + "language": "hat", + "script": "Latn", + "lang_script": "hat_Latn", + "size_kb": 240.0, + "filename": "text_hat_Latn.txt", + "tokenized_sizes": { + "tiktoken": 200.294921875 + } + }, + { + "language": "hau", + "script": "Latn", + "lang_script": "hau_Latn", + "size_kb": 274.0, + "filename": "text_hau_Latn.txt", + "tokenized_sizes": { + "tiktoken": 225.41015625 + } + }, + { + "language": "heb", + "script": "Hebr", + "lang_script": "heb_Hebr", + "size_kb": 352.0, + "filename": "text_heb_Hebr.txt", + "tokenized_sizes": { + "tiktoken": 452.626953125 + } + }, + { + "language": "hin", + "script": "Deva", + "lang_script": "hin_Deva", + "size_kb": 646.0, + "filename": "text_hin_Deva.txt", + "tokenized_sizes": { + "tiktoken": 769.654296875 + } + }, + { + "language": "hne", + "script": "Deva", + "lang_script": "hne_Deva", + "size_kb": 624.0, + "filename": "text_hne_Deva.txt", + "tokenized_sizes": { + "tiktoken": 743.744140625 + } + }, + { + "language": "hrv", + "script": "Latn", + "lang_script": "hrv_Latn", + "size_kb": 256.0, + "filename": "text_hrv_Latn.txt", + "tokenized_sizes": { + "tiktoken": 224.17578125 + } + }, + { + "language": "hun", + "script": "Latn", + "lang_script": "hun_Latn", + "size_kb": 293.0, + "filename": "text_hun_Latn.txt", + "tokenized_sizes": { + "tiktoken": 277.033203125 + } + }, + { + "language": "hye", + "script": "Armn", + "lang_script": "hye_Armn", + "size_kb": 518.0, + "filename": "text_hye_Armn.txt", + "tokenized_sizes": { + "tiktoken": 1028.42578125 + } + }, + { + "language": "ibo", + "script": "Latn", + "lang_script": "ibo_Latn", + "size_kb": 306.0, + "filename": "text_ibo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 355.85546875 + } + }, + { + "language": "ilo", + "script": "Latn", + "lang_script": "ilo_Latn", + "size_kb": 307.0, + "filename": "text_ilo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 236.40234375 + } + }, + { + "language": "ind", + "script": "Latn", + "lang_script": "ind_Latn", + "size_kb": 275.0, + "filename": "text_ind_Latn.txt", + "tokenized_sizes": { + "tiktoken": 207.615234375 + } + }, + { + "language": "isl", + "script": "Latn", + "lang_script": "isl_Latn", + "size_kb": 277.0, + "filename": "text_isl_Latn.txt", + "tokenized_sizes": { + "tiktoken": 253.580078125 + } + }, + { + "language": "ita", + "script": "Latn", + "lang_script": "ita_Latn", + "size_kb": 301.0, + "filename": "text_ita_Latn.txt", + "tokenized_sizes": { + "tiktoken": 210.482421875 + } + }, + { + "language": "jav", + "script": "Latn", + "lang_script": "jav_Latn", + "size_kb": 264.0, + "filename": "text_jav_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.388671875 + } + }, + { + "language": "jpn", + "script": "Jpan", + "lang_script": "jpn_Jpan", + "size_kb": 322.0, + "filename": "text_jpn_Jpan.txt", + "tokenized_sizes": { + "tiktoken": 309.5625 + } + }, + { + "language": "kab", + "script": "Latn", + "lang_script": "kab_Latn", + "size_kb": 268.0, + "filename": "text_kab_Latn.txt", + "tokenized_sizes": { + "tiktoken": 260.916015625 + } + }, + { + "language": "kac", + "script": "Latn", + "lang_script": "kac_Latn", + "size_kb": 322.0, + "filename": "text_kac_Latn.txt", + "tokenized_sizes": { + "tiktoken": 275.46875 + } + }, + { + "language": "kam", + "script": "Latn", + "lang_script": "kam_Latn", + "size_kb": 258.0, + "filename": "text_kam_Latn.txt", + 
"tokenized_sizes": { + "tiktoken": 242.33984375 + } + }, + { + "language": "kan", + "script": "Knda", + "lang_script": "kan_Knda", + "size_kb": 718.0, + "filename": "text_kan_Knda.txt", + "tokenized_sizes": { + "tiktoken": 1404.755859375 + } + }, + { + "language": "kas", + "script": "Arab", + "lang_script": "kas_Arab", + "size_kb": 437.0, + "filename": "text_kas_Arab.txt", + "tokenized_sizes": { + "tiktoken": 636.080078125 + } + }, + { + "language": "kas", + "script": "Deva", + "lang_script": "kas_Deva", + "size_kb": 608.0, + "filename": "text_kas_Deva.txt", + "tokenized_sizes": { + "tiktoken": 725.751953125 + } + }, + { + "language": "kat", + "script": "Geor", + "lang_script": "kat_Geor", + "size_kb": 747.0, + "filename": "text_kat_Geor.txt", + "tokenized_sizes": { + "tiktoken": 1425.55078125 + } + }, + { + "language": "kaz", + "script": "Cyrl", + "lang_script": "kaz_Cyrl", + "size_kb": 478.0, + "filename": "text_kaz_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 611.3046875 + } + }, + { + "language": "kbp", + "script": "Latn", + "lang_script": "kbp_Latn", + "size_kb": 348.0, + "filename": "text_kbp_Latn.txt", + "tokenized_sizes": { + "tiktoken": 503.822265625 + } + }, + { + "language": "kea", + "script": "Latn", + "lang_script": "kea_Latn", + "size_kb": 258.0, + "filename": "text_kea_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.4453125 + } + }, + { + "language": "khk", + "script": "Cyrl", + "lang_script": "khk_Cyrl", + "size_kb": 485.0, + "filename": "text_khk_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 662.623046875 + } + }, + { + "language": "khm", + "script": "Khmr", + "lang_script": "khm_Khmr", + "size_kb": 845.0, + "filename": "text_khm_Khmr.txt", + "tokenized_sizes": { + "tiktoken": 1578.11328125 + } + }, + { + "language": "kik", + "script": "Latn", + "lang_script": "kik_Latn", + "size_kb": 329.0, + "filename": "text_kik_Latn.txt", + "tokenized_sizes": { + "tiktoken": 357.6328125 + } + }, + { + "language": "kin", + "script": "Latn", + "lang_script": "kin_Latn", + "size_kb": 288.0, + "filename": "text_kin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 247.646484375 + } + }, + { + "language": "kir", + "script": "Cyrl", + "lang_script": "kir_Cyrl", + "size_kb": 477.0, + "filename": "text_kir_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 592.859375 + } + }, + { + "language": "kmb", + "script": "Latn", + "lang_script": "kmb_Latn", + "size_kb": 282.0, + "filename": "text_kmb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 244.283203125 + } + }, + { + "language": "kmr", + "script": "Latn", + "lang_script": "kmr_Latn", + "size_kb": 279.0, + "filename": "text_kmr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 256.443359375 + } + }, + { + "language": "knc", + "script": "Arab", + "lang_script": "knc_Arab", + "size_kb": 405.0, + "filename": "text_knc_Arab.txt", + "tokenized_sizes": { + "tiktoken": 488.65234375 + } + }, + { + "language": "knc", + "script": "Latn", + "lang_script": "knc_Latn", + "size_kb": 282.0, + "filename": "text_knc_Latn.txt", + "tokenized_sizes": { + "tiktoken": 269.248046875 + } + }, + { + "language": "kon", + "script": "Latn", + "lang_script": "kon_Latn", + "size_kb": 289.0, + "filename": "text_kon_Latn.txt", + "tokenized_sizes": { + "tiktoken": 227.642578125 + } + }, + { + "language": "kor", + "script": "Hang", + "lang_script": "kor_Hang", + "size_kb": 304.0, + "filename": "text_kor_Hang.txt", + "tokenized_sizes": { + "tiktoken": 523.220703125 + } + }, + { + "language": "lao", + "script": "Laoo", + "lang_script": "lao_Laoo", + "size_kb": 692.0, + 
"filename": "text_lao_Laoo.txt", + "tokenized_sizes": { + "tiktoken": 1354.791015625 + } + }, + { + "language": "lij", + "script": "Latn", + "lang_script": "lij_Latn", + "size_kb": 296.0, + "filename": "text_lij_Latn.txt", + "tokenized_sizes": { + "tiktoken": 239.205078125 + } + }, + { + "language": "lim", + "script": "Latn", + "lang_script": "lim_Latn", + "size_kb": 272.0, + "filename": "text_lim_Latn.txt", + "tokenized_sizes": { + "tiktoken": 214.634765625 + } + }, + { + "language": "lin", + "script": "Latn", + "lang_script": "lin_Latn", + "size_kb": 274.0, + "filename": "text_lin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 212.880859375 + } + }, + { + "language": "lit", + "script": "Latn", + "lang_script": "lit_Latn", + "size_kb": 270.0, + "filename": "text_lit_Latn.txt", + "tokenized_sizes": { + "tiktoken": 256.07421875 + } + }, + { + "language": "lmo", + "script": "Latn", + "lang_script": "lmo_Latn", + "size_kb": 294.0, + "filename": "text_lmo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 247.615234375 + } + }, + { + "language": "ltg", + "script": "Latn", + "lang_script": "ltg_Latn", + "size_kb": 266.0, + "filename": "text_ltg_Latn.txt", + "tokenized_sizes": { + "tiktoken": 250.259765625 + } + }, + { + "language": "ltz", + "script": "Latn", + "lang_script": "ltz_Latn", + "size_kb": 292.0, + "filename": "text_ltz_Latn.txt", + "tokenized_sizes": { + "tiktoken": 235.171875 + } + }, + { + "language": "lua", + "script": "Latn", + "lang_script": "lua_Latn", + "size_kb": 274.0, + "filename": "text_lua_Latn.txt", + "tokenized_sizes": { + "tiktoken": 223.138671875 + } + }, + { + "language": "lug", + "script": "Latn", + "lang_script": "lug_Latn", + "size_kb": 262.0, + "filename": "text_lug_Latn.txt", + "tokenized_sizes": { + "tiktoken": 227.66015625 + } + }, + { + "language": "luo", + "script": "Latn", + "lang_script": "luo_Latn", + "size_kb": 266.0, + "filename": "text_luo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 214.599609375 + } + }, + { + "language": "lus", + "script": "Latn", + "lang_script": "lus_Latn", + "size_kb": 279.0, + "filename": "text_lus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 219.353515625 + } + }, + { + "language": "lvs", + "script": "Latn", + "lang_script": "lvs_Latn", + "size_kb": 283.0, + "filename": "text_lvs_Latn.txt", + "tokenized_sizes": { + "tiktoken": 265.765625 + } + }, + { + "language": "mag", + "script": "Deva", + "lang_script": "mag_Deva", + "size_kb": 625.0, + "filename": "text_mag_Deva.txt", + "tokenized_sizes": { + "tiktoken": 745.328125 + } + }, + { + "language": "mai", + "script": "Deva", + "lang_script": "mai_Deva", + "size_kb": 641.0, + "filename": "text_mai_Deva.txt", + "tokenized_sizes": { + "tiktoken": 767.1484375 + } + }, + { + "language": "mal", + "script": "Mlym", + "lang_script": "mal_Mlym", + "size_kb": 787.0, + "filename": "text_mal_Mlym.txt", + "tokenized_sizes": { + "tiktoken": 1565.658203125 + } + }, + { + "language": "mar", + "script": "Deva", + "lang_script": "mar_Deva", + "size_kb": 677.0, + "filename": "text_mar_Deva.txt", + "tokenized_sizes": { + "tiktoken": 810.67578125 + } + }, + { + "language": "min", + "script": "Arab", + "lang_script": "min_Arab", + "size_kb": 441.0, + "filename": "text_min_Arab.txt", + "tokenized_sizes": { + "tiktoken": 539.71875 + } + }, + { + "language": "min", + "script": "Latn", + "lang_script": "min_Latn", + "size_kb": 271.0, + "filename": "text_min_Latn.txt", + "tokenized_sizes": { + "tiktoken": 206.884765625 + } + }, + { + "language": "mkd", + "script": "Cyrl", + "lang_script": "mkd_Cyrl", + 
"size_kb": 480.0, + "filename": "text_mkd_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 564.958984375 + } + }, + { + "language": "mlt", + "script": "Latn", + "lang_script": "mlt_Latn", + "size_kb": 295.0, + "filename": "text_mlt_Latn.txt", + "tokenized_sizes": { + "tiktoken": 281.068359375 + } + }, + { + "language": "mni", + "script": "Beng", + "lang_script": "mni_Beng", + "size_kb": 701.0, + "filename": "text_mni_Beng.txt", + "tokenized_sizes": { + "tiktoken": 1048.951171875 + } + }, + { + "language": "mos", + "script": "Latn", + "lang_script": "mos_Latn", + "size_kb": 262.0, + "filename": "text_mos_Latn.txt", + "tokenized_sizes": { + "tiktoken": 264.98828125 + } + }, + { + "language": "mri", + "script": "Latn", + "lang_script": "mri_Latn", + "size_kb": 294.0, + "filename": "text_mri_Latn.txt", + "tokenized_sizes": { + "tiktoken": 255.35546875 + } + }, + { + "language": "mya", + "script": "Mymr", + "lang_script": "mya_Mymr", + "size_kb": 890.0, + "filename": "text_mya_Mymr.txt", + "tokenized_sizes": { + "tiktoken": 1738.203125 + } + }, + { + "language": "nld", + "script": "Latn", + "lang_script": "nld_Latn", + "size_kb": 283.0, + "filename": "text_nld_Latn.txt", + "tokenized_sizes": { + "tiktoken": 206.50390625 + } + }, + { + "language": "nno", + "script": "Latn", + "lang_script": "nno_Latn", + "size_kb": 263.0, + "filename": "text_nno_Latn.txt", + "tokenized_sizes": { + "tiktoken": 202.712890625 + } + }, + { + "language": "nob", + "script": "Latn", + "lang_script": "nob_Latn", + "size_kb": 261.0, + "filename": "text_nob_Latn.txt", + "tokenized_sizes": { + "tiktoken": 195.431640625 + } + }, + { + "language": "npi", + "script": "Deva", + "lang_script": "npi_Deva", + "size_kb": 650.0, + "filename": "text_npi_Deva.txt", + "tokenized_sizes": { + "tiktoken": 782.9453125 + } + }, + { + "language": "nso", + "script": "Latn", + "lang_script": "nso_Latn", + "size_kb": 298.0, + "filename": "text_nso_Latn.txt", + "tokenized_sizes": { + "tiktoken": 242.853515625 + } + }, + { + "language": "nus", + "script": "Latn", + "lang_script": "nus_Latn", + "size_kb": 335.0, + "filename": "text_nus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 436.869140625 + } + }, + { + "language": "nya", + "script": "Latn", + "lang_script": "nya_Latn", + "size_kb": 285.0, + "filename": "text_nya_Latn.txt", + "tokenized_sizes": { + "tiktoken": 237.115234375 + } + }, + { + "language": "oci", + "script": "Latn", + "lang_script": "oci_Latn", + "size_kb": 298.0, + "filename": "text_oci_Latn.txt", + "tokenized_sizes": { + "tiktoken": 216.3203125 + } + }, + { + "language": "ory", + "script": "Orya", + "lang_script": "ory_Orya", + "size_kb": 693.0, + "filename": "text_ory_Orya.txt", + "tokenized_sizes": { + "tiktoken": 1374.439453125 + } + }, + { + "language": "pag", + "script": "Latn", + "lang_script": "pag_Latn", + "size_kb": 253.0, + "filename": "text_pag_Latn.txt", + "tokenized_sizes": { + "tiktoken": 174.919921875 + } + }, + { + "language": "pan", + "script": "Guru", + "lang_script": "pan_Guru", + "size_kb": 657.0, + "filename": "text_pan_Guru.txt", + "tokenized_sizes": { + "tiktoken": 815.029296875 + } + }, + { + "language": "pap", + "script": "Latn", + "lang_script": "pap_Latn", + "size_kb": 274.0, + "filename": "text_pap_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.033203125 + } + }, + { + "language": "pbt", + "script": "Arab", + "lang_script": "pbt_Arab", + "size_kb": 421.0, + "filename": "text_pbt_Arab.txt", + "tokenized_sizes": { + "tiktoken": 554.59375 + } + }, + { + "language": "pes", + "script": "Arab", + 
"lang_script": "pes_Arab", + "size_kb": 430.0, + "filename": "text_pes_Arab.txt", + "tokenized_sizes": { + "tiktoken": 547.0625 + } + }, + { + "language": "plt", + "script": "Latn", + "lang_script": "plt_Latn", + "size_kb": 319.0, + "filename": "text_plt_Latn.txt", + "tokenized_sizes": { + "tiktoken": 270.25390625 + } + }, + { + "language": "pol", + "script": "Latn", + "lang_script": "pol_Latn", + "size_kb": 286.0, + "filename": "text_pol_Latn.txt", + "tokenized_sizes": { + "tiktoken": 280.4609375 + } + }, + { + "language": "por", + "script": "Latn", + "lang_script": "por_Latn", + "size_kb": 283.0, + "filename": "text_por_Latn.txt", + "tokenized_sizes": { + "tiktoken": 202.765625 + } + }, + { + "language": "prs", + "script": "Arab", + "lang_script": "prs_Arab", + "size_kb": 413.0, + "filename": "text_prs_Arab.txt", + "tokenized_sizes": { + "tiktoken": 526.087890625 + } + }, + { + "language": "quy", + "script": "Latn", + "lang_script": "quy_Latn", + "size_kb": 273.0, + "filename": "text_quy_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.93359375 + } + }, + { + "language": "ron", + "script": "Latn", + "lang_script": "ron_Latn", + "size_kb": 301.0, + "filename": "text_ron_Latn.txt", + "tokenized_sizes": { + "tiktoken": 259.185546875 + } + }, + { + "language": "vie", + "script": "Latn", + "lang_script": "vie_Latn", + "size_kb": 360.0, + "filename": "text_vie_Latn.txt", + "tokenized_sizes": { + "tiktoken": 470.21875 + } + }, + { + "language": "swh", + "script": "Latn", + "lang_script": "swh_Latn", + "size_kb": 272.0, + "filename": "text_swh_Latn.txt", + "tokenized_sizes": { + "tiktoken": 223.5703125 + } + } +] \ No newline at end of file diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index deda56df65..db841c7057 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -1,25 +1,227 @@ #!/bin/bash -### Instructions: -# 1. Replace "INSERT_URL_WITH_FILES" with the actual URL to the Parquet files. -# 2. Modify the "include_keys" array to specify the keys you want to include in the output. -# 3. (Optionally) Modify the "value_prefixes" array to set prefixes for each value, use "" for empty prefixes -# 4. Set "--skip_empty" to true if you want to skip empty fields, or false if not needed. -# 5. Set "--no_output_text" to true if you plan to process the intermediate json files in a custom manner. -# 6. For CSV files with BOM headers, pass "--input_encoding utf-8-sig" to the helper script. -# 7. For CSV cells that contain multi-line text, use "--split_multiline_values" to emit one line per entry or -# "--newline_replacement" to substitute newline characters with custom text. 
- -# Run the Python script with the specified arguments lang_array=( "text_eng_Latn" "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" - "text_shn_Mymr" + "text_yue_Hant" + "text_hin_Deva" + "text_vie_Latn" + "text_ind_Latn" + "text_swh_Latn" + "text_ell_Grek" + "text_fra_Latn" ) +# lang_array=( +# "text_ace_Arab" +# "text_ace_Latn" +# "text_acm_Arab" +# "text_acq_Arab" +# "text_aeb_Arab" +# "text_afr_Latn" +# "text_ajp_Arab" +# "text_aka_Latn" +# "text_als_Latn" +# "text_amh_Ethi" +# "text_apc_Arab" +# "text_arb_Arab" +# "text_arb_Latn" +# "text_ars_Arab" +# "text_ary_Arab" +# "text_arz_Arab" +# "text_asm_Beng" +# "text_ast_Latn" +# "text_awa_Deva" +# "text_ayr_Latn" +# "text_azb_Arab" +# "text_azj_Latn" +# "text_bak_Cyrl" +# "text_bam_Latn" +# "text_ban_Latn" +# "text_bel_Cyrl" +# "text_bem_Latn" +# "text_ben_Beng" +# "text_bho_Deva" +# "text_bjn_Arab" +# "text_bjn_Latn" +# "text_bod_Tibt" +# "text_bos_Latn" +# "text_bug_Latn" +# "text_bul_Cyrl" +# "text_cat_Latn" +# "text_ceb_Latn" +# "text_ces_Latn" +# "text_cjk_Latn" +# "text_ckb_Arab" +# "text_crh_Latn" +# "text_cym_Latn" +# "text_dan_Latn" +# "text_deu_Latn" +# "text_dik_Latn" +# "text_dyu_Latn" +# "text_dzo_Tibt" +# "text_ell_Grek" +# "text_eng_Latn" +# "text_epo_Latn" +# "text_est_Latn" +# "text_eus_Latn" +# "text_ewe_Latn" +# "text_fao_Latn" +# "text_fij_Latn" +# "text_fin_Latn" +# "text_fon_Latn" +# "text_fra_Latn" +# "text_fur_Latn" +# "text_fuv_Latn" +# "text_gaz_Latn" +# "text_gla_Latn" +# "text_gle_Latn" +# "text_glg_Latn" +# "text_grn_Latn" +# "text_guj_Gujr" +# "text_hat_Latn" +# "text_hau_Latn" +# "text_heb_Hebr" +# "text_hin_Deva" +# "text_hne_Deva" +# "text_hrv_Latn" +# "text_hun_Latn" +# "text_hye_Armn" +# "text_ibo_Latn" +# "text_ilo_Latn" +# "text_ind_Latn" +# "text_isl_Latn" +# "text_ita_Latn" +# "text_jav_Latn" +# "text_jpn_Jpan" +# "text_kab_Latn" +# "text_kac_Latn" +# "text_kam_Latn" +# "text_kan_Knda" +# "text_kas_Arab" +# "text_kas_Deva" +# "text_kat_Geor" +# "text_kaz_Cyrl" +# "text_kbp_Latn" +# "text_kea_Latn" +# "text_khk_Cyrl" +# "text_khm_Khmr" +# "text_kik_Latn" +# "text_kin_Latn" +# "text_kir_Cyrl" +# "text_kmb_Latn" +# "text_kmr_Latn" +# "text_knc_Arab" +# "text_knc_Latn" +# "text_kon_Latn" +# "text_kor_Hang" +# "text_lao_Laoo" +# "text_lij_Latn" +# "text_lim_Latn" +# "text_lin_Latn" +# "text_lit_Latn" +# "text_lmo_Latn" +# "text_ltg_Latn" +# "text_ltz_Latn" +# "text_lua_Latn" +# "text_lug_Latn" +# "text_luo_Latn" +# "text_lus_Latn" +# "text_lvs_Latn" +# "text_mag_Deva" +# "text_mai_Deva" +# "text_mal_Mlym" +# "text_mar_Deva" +# "text_min_Arab" +# "text_min_Latn" +# "text_mkd_Cyrl" +# "text_mlt_Latn" +# "text_mni_Beng" +# "text_mos_Latn" +# "text_mri_Latn" +# "text_mya_Mymr" +# "text_nld_Latn" +# "text_nno_Latn" +# "text_nob_Latn" +# "text_npi_Deva" +# "text_nso_Latn" +# "text_nus_Latn" +# "text_nya_Latn" +# "text_oci_Latn" +# "text_ory_Orya" +# "text_pag_Latn" +# "text_pan_Guru" +# "text_pap_Latn" +# "text_pbt_Arab" +# "text_pes_Arab" +# "text_plt_Latn" +# "text_pol_Latn" +# "text_por_Latn" +# "text_prs_Arab" +# "text_quy_Latn" +# "text_ron_Latn" +# "text_run_Latn" +# "text_rus_Cyrl" +# "text_sag_Latn" +# "text_san_Deva" +# "text_sat_Olck" +# "text_scn_Latn" +# "text_shn_Mymr" +# "text_sin_Sinh" +# "text_slk_Latn" +# "text_slv_Latn" +# "text_smo_Latn" +# "text_sna_Latn" +# "text_snd_Arab" +# "text_som_Latn" +# "text_sot_Latn" +# "text_spa_Latn" +# "text_srd_Latn" +# "text_srp_Cyrl" +# "text_ssw_Latn" +# "text_sun_Latn" +# "text_swe_Latn" +# "text_swh_Latn" +# "text_szl_Latn" +# "text_tam_Taml" +# 
"text_taq_Latn" +# "text_taq_Tfng" +# "text_tat_Cyrl" +# "text_tel_Telu" +# "text_tgk_Cyrl" +# "text_tgl_Latn" +# "text_tha_Thai" +# "text_tir_Ethi" +# "text_tpi_Latn" +# "text_tsn_Latn" +# "text_tso_Latn" +# "text_tuk_Latn" +# "text_tum_Latn" +# "text_tur_Latn" +# "text_twi_Latn" +# "text_tzm_Tfng" +# "text_uig_Arab" +# "text_ukr_Cyrl" +# "text_umb_Latn" +# "text_urd_Arab" +# "text_uzn_Latn" +# "text_vec_Latn" +# "text_vie_Latn" +# "text_war_Latn" +# "text_wol_Latn" +# "text_xho_Latn" +# "text_ydd_Hebr" +# "text_yor_Latn" +# "text_yue_Hant" +# "text_zho_Hans" +# "text_zho_Hant" +# "text_zsm_Latn" +# "text_zul_Latn" +# ) + # Add url with dataset here: url="https://huggingface.co/datasets/muhammadravi251001/restructured-flores200/tree/main/data" @@ -29,5 +231,4 @@ for lang in "${lang_array[@]}"; do --include_keys "$lang" \ --value_prefix $'\n' \ --output_text_file "$lang".txt - done diff --git a/data/flores200-res/graphs.sh b/data/flores200-res/graphs.sh new file mode 100644 index 0000000000..aabf067fda --- /dev/null +++ b/data/flores200-res/graphs.sh @@ -0,0 +1,9 @@ +#!/bin/bash +python3 plot_langscript_sizes_grouped.py --group-by script --color-by script --out by_script.png +python3 plot_langscript_sizes_grouped.py --group-by script --color-by region --out by_region_script.png +python3 plot_langscript_sizes_grouped.py --group-by region --color-by region --out by_region.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by family --out by_family.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by script --out by_family_script.png + + + diff --git a/data/flores200-res/ipa_scripts.sh b/data/flores200-res/ipa_scripts.sh new file mode 100644 index 0000000000..ead45ced52 --- /dev/null +++ b/data/flores200-res/ipa_scripts.sh @@ -0,0 +1,23 @@ +# include tokenized comparison (uses tokenized_sizes["tiktoken"] from filtered_scripts.json) +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_files.json \ +# --tok-method tiktoken + +# # save everything to plots_out/ +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_scripts.json \ +# --tok-method tiktoken \ +# --save --outdir plots_out --csv + +# only keep languages that have tiktoken sizes +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_tiles.json \ +# --tok-method tiktoken \ +# --skip-missing-tok + + +python3 plot_ipa_vs_text.py --text-dir text --ipa-dir ipa --save --outdir plots_out --csv + diff --git a/data/flores200-res/ja_stats.json b/data/flores200-res/ja_stats.json new file mode 100644 index 0000000000..30b83d8af6 --- /dev/null +++ b/data/flores200-res/ja_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 305426, + "not_transcribed_bytes": 21596, + "total_bytes": 327022, + "pct_transcribed": 93.3961629492817, + "pct_not_transcribed": 6.6038370507183 +} \ No newline at end of file diff --git a/data/flores200-res/ko_stats.json b/data/flores200-res/ko_stats.json new file mode 100644 index 0000000000..5330f63aca --- /dev/null +++ b/data/flores200-res/ko_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 271690, + "not_transcribed_bytes": 8833, + "total_bytes": 280523, + "pct_transcribed": 96.85123857936783, + "pct_not_transcribed": 3.148761420632176 +} \ No newline at end of file diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 007c034696..2552a922ff 100644 --- a/data/flores200-res/phoneticize.sh +++ 
b/data/flores200-res/phoneticize.sh @@ -1,7 +1,25 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper +# python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +# python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +# python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +# python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json + +lang_array=( + "text_vie_Latn:vi" + "text_ind_Latn:id" + "text_swh_Latn:sw" + "text_ell_Grek:el" + "text_fra_Latn:fr" + "text_yue_Hant:yue" +) + +for lang in "${lang_array[@]}"; do + text_file="${lang%%:*}" + two_letter_code="${lang##*:}" + echo "${text_file}; ${two_letter_code}" + if [ ! -f "ipa_${text_file}.txt" ]; then + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + fi +done diff --git a/data/flores200-res/plot_ipa_vs_text.py b/data/flores200-res/plot_ipa_vs_text.py new file mode 100644 index 0000000000..5c0e246466 --- /dev/null +++ b/data/flores200-res/plot_ipa_vs_text.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +plot_ipa_vs_text.py + +Graphs IPA vs raw text sizes for paired files across directories: + + text/ + ipa/ + +Defaults: + --text-dir text/ + --ipa-dir ipa/ + +Tokenization: +- Can also load tokenized sizes (e.g. tiktoken) from filtered_scripts.json (or other) + produced by your tokenize_and_annotate_sizes.py pipeline, and plot: + + raw_bytes vs ipa_bytes vs tok_bytes + +Assumptions for filtered JSON rows: + - list[dict] + - key "lang_script" OR ("language"+"_"+"script") matches the part of text_.txt + - key "tokenized_sizes" is a dict like {"tiktoken": , ...} + +Produces (same as before): +- scatter: IPA bytes vs raw bytes +- bar: IPA/raw ratio +- bar: delta bytes (IPA - raw) + +Additionally (if filtered json provided & matches are found): +- grouped bar: Raw vs IPA vs Tokenized (bytes) per language + +""" + +from __future__ import annotations + +import argparse +import csv +from dataclasses import dataclass +from pathlib import Path +import math +import json + +from typing import Dict, List, Optional, Tuple + +import matplotlib.pyplot as plt + + +@dataclass +class PairStats: + lang: str + raw_path: Path + ipa_path: Path + raw_bytes: int + ipa_bytes: int + raw_chars: int + ipa_chars: int + raw_lines: int + ipa_lines: int + tok_bytes: Optional[int] = None # NEW + + @property + def ratio_bytes(self) -> float: + return (self.ipa_bytes / self.raw_bytes) if self.raw_bytes else float("inf") + + @property + def delta_bytes(self) -> int: + return self.ipa_bytes - self.raw_bytes + + +def read_stats(p: Path) -> Tuple[int, int, int]: + """ + Returns (utf8_bytes, chars, lines). 
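+
+    A file containing the two lines "ab" and "cd" with no trailing newline,
+    for example, yields (5, 5, 2): 5 UTF-8 bytes, 5 characters, 2 lines.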
+ """ + data = p.read_text(encoding="utf-8", errors="replace") + b = len(data.encode("utf-8")) + c = len(data) + lines = data.count("\n") + (1 if data and not data.endswith("\n") else 0) + return b, c, lines + + +def discover_pairs(text_dir: Path, ipa_dir: Path) -> List[Tuple[str, Path, Path]]: + """ + Finds pairs across directories: + text_dir/text_.txt + ipa_dir/ipa_text_.txt + + Note: can be "eng_Latn" etc; we treat it as an opaque key. + """ + raw_map: Dict[str, Path] = {} + ipa_map: Dict[str, Path] = {} + + for p in text_dir.iterdir(): + if p.is_file() and p.name.startswith("text_") and p.name.endswith(".txt"): + lang = p.name[len("text_") : -len(".txt")] + raw_map[lang] = p + + for p in ipa_dir.iterdir(): + if p.is_file() and p.name.startswith("ipa_text_") and p.name.endswith(".txt"): + lang = p.name[len("ipa_text_") : -len(".txt")] + ipa_map[lang] = p + + langs = sorted(set(raw_map) & set(ipa_map)) + return [(lang, raw_map[lang], ipa_map[lang]) for lang in langs] + + +def _load_tokenized_kb_map(filtered_json: Path, method: str) -> Dict[str, float]: + """ + Returns: { lang_script_key -> tokenized_size_kb } for the chosen method. + + Expects rows like: + { + "lang_script": "eng_Latn", + "tokenized_sizes": {"tiktoken": 300.0}, + ... + } + """ + if not filtered_json.exists(): + raise FileNotFoundError(f"filtered json not found: {filtered_json}") + + rows = json.loads(filtered_json.read_text(encoding="utf-8")) + if not isinstance(rows, list): + raise ValueError("filtered json must be a list of objects") + + out: Dict[str, float] = {} + for r in rows: + if not isinstance(r, dict): + continue + + key = r.get("lang_script") + if not key: + # try reconstruct + lang = r.get("language") + script = r.get("script") + if lang and script: + key = f"{lang}_{script}" + + if not key: + continue + + tok_map = r.get("tokenized_sizes") + if not isinstance(tok_map, dict): + continue + + v = tok_map.get(method) + if v is None: + continue + + try: + out[str(key)] = float(v) # KB + except Exception: + continue + + return out + + +def make_scatter(stats: List[PairStats], outpath: Optional[Path], title: str) -> None: + x = [s.raw_bytes for s in stats] + y = [s.ipa_bytes for s in stats] + labels = [s.lang for s in stats] + + plt.figure() + plt.scatter(x, y) + + for xi, yi, lab in zip(x, y, labels): + plt.annotate(lab, (xi, yi), textcoords="offset points", xytext=(6, 4)) + + plt.xlabel("Raw text size (UTF-8 bytes)") + plt.ylabel("IPA text size (UTF-8 bytes)") + plt.title(title) + plt.grid(True, linestyle="--", linewidth=0.5) + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def make_bar( + stats: List[PairStats], + values: List[float], + ylabel: str, + outpath: Optional[Path], + title: str, +) -> None: + langs = [s.lang for s in stats] + plt.figure(figsize=(max(8, 0.8 * len(langs)), 5)) + plt.bar(langs, values) + plt.ylabel(ylabel) + plt.title(title) + plt.xticks(rotation=35, ha="right") + plt.grid(True, axis="y", linestyle="--", linewidth=0.5) + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def _mean_std(vals: List[float]) -> Tuple[float, float]: + """ + Population mean/std (ddof=0) over vals. 
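+
+    Returns (0.0, 0.0) for an empty list. For example:
+        >>> _mean_std([1.0, 3.0])
+        (2.0, 1.0)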
+ """ + if not vals: + return 0.0, 0.0 + m = sum(vals) / len(vals) + var = sum((v - m) ** 2 for v in vals) / len(vals) + return m, math.sqrt(var) + + +def make_back_to_back_bar( + stats: List[PairStats], + outpath: Optional[Path], + title: str = "Raw vs IPA Text Size (UTF-8 bytes)", +) -> None: + """ + Back-to-back horizontal bar chart: + - Raw text on the left (negative) + - IPA text on the right (positive) + + Adds: + - dotted mean lines for raw and ipa + - dotted ±1 stddev lines for raw and ipa + """ + langs = [s.lang for s in stats] + raw_bytes = [float(s.raw_bytes) for s in stats] + ipa_bytes = [float(s.ipa_bytes) for s in stats] + + raw_vals = [-b for b in raw_bytes] # negative for left side + ipa_vals = ipa_bytes # positive for right side + + raw_mean, raw_std = _mean_std(raw_bytes) + ipa_mean, ipa_std = _mean_std(ipa_bytes) + + y = range(len(langs)) + + plt.figure(figsize=(10, max(5, 0.5 * len(langs)))) + plt.barh(y, raw_vals, label="Raw text", alpha=0.7) + plt.barh(y, ipa_vals, label="IPA text", alpha=0.7) + + plt.yticks(y, langs) + plt.axvline(0, color="black", linewidth=1) + + # Mean lines (dotted) + plt.axvline(-raw_mean, linestyle=":", linewidth=2, label=f"Raw mean ({raw_mean:.0f})") + plt.axvline(ipa_mean, linestyle=":", linewidth=2, label=f"IPA mean ({ipa_mean:.0f})") + + # ±1 stddev lines (dotted, lighter) + plt.axvline(-(raw_mean - raw_std), linestyle=":", linewidth=1) + plt.axvline(-(raw_mean + raw_std), linestyle=":", linewidth=1) + plt.axvline(ipa_mean - ipa_std, linestyle=":", linewidth=1) + plt.axvline(ipa_mean + ipa_std, linestyle=":", linewidth=1) + + plt.xlabel("UTF-8 bytes") + plt.title( + f"{title}\n" + f"Raw mean={raw_mean:.0f}, std={raw_std:.0f} | " + f"IPA mean={ipa_mean:.0f}, std={ipa_std:.0f}" + ) + plt.grid(True, axis="x", linestyle="--", linewidth=0.5) + + max_val = max(max(ipa_vals), max(abs(v) for v in raw_vals)) + max_val = max(max_val, raw_mean + raw_std, ipa_mean + ipa_std) + plt.xlim(-max_val * 1.15, max_val * 1.15) + + plt.legend(loc="best") + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + print(f"[back-to-back] Raw bytes: mean={raw_mean:.2f}, std={raw_std:.2f}") + print(f"[back-to-back] IPA bytes: mean={ipa_mean:.2f}, std={ipa_std:.2f}") + + +def make_grouped_raw_ipa_tok( + stats: List[PairStats], + outpath: Optional[Path], + tok_label: str, + title: str = "Raw vs IPA vs Tokenized Size (UTF-8 bytes)", +) -> None: + """ + Grouped (clustered) vertical bar chart per language: + raw, ipa, tok (if present) + + If some rows are missing tok_bytes, we simply omit that bar for that language. 
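+
+    Bars are offset around each language's x position: raw at x - width,
+    IPA at x, and the tokenized bar (when present) at x + width.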
+ """ + langs = [s.lang for s in stats] + raw = [s.raw_bytes for s in stats] + ipa = [s.ipa_bytes for s in stats] + tok = [s.tok_bytes for s in stats] # Optional[int] + + x = list(range(len(langs))) + width = 0.25 + + plt.figure(figsize=(max(10, 0.9 * len(langs)), 5)) + + # raw and ipa always present + plt.bar([i - width for i in x], raw, width=width, label="Raw") + plt.bar([i for i in x], ipa, width=width, label="IPA") + + # tokenized: only where present + tok_x = [] + tok_y = [] + for i, v in enumerate(tok): + if v is not None: + tok_x.append(i + width) + tok_y.append(v) + if tok_x: + plt.bar(tok_x, tok_y, width=width, label=tok_label) + + plt.xticks(x, langs, rotation=35, ha="right") + plt.ylabel("Bytes (UTF-8)") + plt.title(title) + plt.grid(True, axis="y", linestyle="--", linewidth=0.5) + plt.legend(loc="best") + + plt.tight_layout() + if outpath: + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def write_csv(stats: List[PairStats], out_csv: Path) -> None: + out_csv.parent.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow( + [ + "lang", + "raw_path", + "ipa_path", + "raw_bytes", + "ipa_bytes", + "ratio_bytes", + "delta_bytes", + "raw_chars", + "ipa_chars", + "raw_lines", + "ipa_lines", + "tok_bytes", + ] + ) + for s in stats: + w.writerow( + [ + s.lang, + str(s.raw_path), + str(s.ipa_path), + s.raw_bytes, + s.ipa_bytes, + f"{s.ratio_bytes:.6f}", + s.delta_bytes, + s.raw_chars, + s.ipa_chars, + s.raw_lines, + s.ipa_lines, + "" if s.tok_bytes is None else s.tok_bytes, + ] + ) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Graph IPA vs raw text sizes across folders.") + ap.add_argument("--text-dir", default="text", help="Directory with text_.txt files (default: text/)") + ap.add_argument("--ipa-dir", default="ipa", help="Directory with ipa_text_.txt files (default: ipa/)") + ap.add_argument("--save", action="store_true", help="Save plots instead of showing them.") + ap.add_argument("--outdir", default="plots_out", help="Output directory when --save is used.") + ap.add_argument("--csv", action="store_true", help="Also write CSV of statistics.") + ap.add_argument( + "--sort", + choices=["lang", "raw_bytes", "ipa_bytes", "ratio", "delta"], + default="lang", + help="Sort order for plots.", + ) + ap.add_argument( + "--title", + default="IPA vs Raw Text Size (UTF-8 bytes)", + help="Title for scatter plot.", + ) + + # NEW: load tokenized sizes (KB) from filtered json and compare in bytes + ap.add_argument( + "--filtered-json", + default=None, + help="Optional: filtered_scripts.json (annotated) containing tokenized_sizes (KB). " + "If set, we will add a Raw vs IPA vs Tokenized plot.", + ) + ap.add_argument( + "--tok-method", + default="tiktoken", + help="Which tokenized_sizes[method] to load from filtered json (default: tiktoken).", + ) + ap.add_argument( + "--skip-missing-tok", + action="store_true", + help="If set, drop languages that don't have tokenized_sizes[tok-method]. 
" + "Default: keep language but omit tok bar.", + ) + + args = ap.parse_args() + + text_dir = Path(args.text_dir) + ipa_dir = Path(args.ipa_dir) + + if not text_dir.exists(): + raise SystemExit(f"text-dir not found: {text_dir}") + if not ipa_dir.exists(): + raise SystemExit(f"ipa-dir not found: {ipa_dir}") + + pairs = discover_pairs(text_dir, ipa_dir) + if not pairs: + raise SystemExit("No matching text_/ipa_text_ pairs found.") + + tok_kb_map: Dict[str, float] = {} + if args.filtered_json: + tok_kb_map = _load_tokenized_kb_map(Path(args.filtered_json), method=args.tok_method) + + stats: List[PairStats] = [] + for lang, raw_p, ipa_p in pairs: + rb, rc, rl = read_stats(raw_p) + ib, ic, il = read_stats(ipa_p) + + tok_bytes: Optional[int] = None + if tok_kb_map: + kb = tok_kb_map.get(lang) + if kb is not None: + tok_bytes = int(round(kb * 1024.0)) + + # optionally drop missing tokenized values + if args.skip_missing_tok and tok_kb_map and tok_bytes is None: + continue + + stats.append( + PairStats( + lang=lang, + raw_path=raw_p, + ipa_path=ipa_p, + raw_bytes=rb, + ipa_bytes=ib, + raw_chars=rc, + ipa_chars=ic, + raw_lines=rl, + ipa_lines=il, + tok_bytes=tok_bytes, + ) + ) + + # Sorting + key_map = { + "lang": lambda s: s.lang, + "raw_bytes": lambda s: s.raw_bytes, + "ipa_bytes": lambda s: s.ipa_bytes, + "ratio": lambda s: s.ratio_bytes, + "delta": lambda s: s.delta_bytes, + } + stats.sort(key=key_map[args.sort]) + + outdir = Path(args.outdir) + if args.save: + outdir.mkdir(parents=True, exist_ok=True) + + make_scatter( + stats, + outdir / "scatter_ipa_vs_raw_bytes.png" if args.save else None, + args.title, + ) + + make_bar( + stats, + [s.ratio_bytes for s in stats], + "IPA / Raw (bytes)", + outdir / "bar_ratio_ipa_over_raw.png" if args.save else None, + "IPA expansion ratio by language", + ) + + make_bar( + stats, + [float(s.delta_bytes) for s in stats], + "IPA - Raw (bytes)", + outdir / "bar_delta_ipa_minus_raw.png" if args.save else None, + "Absolute size increase (IPA − Raw)", + ) + + make_back_to_back_bar( + stats, + outdir / "bar_back_to_back_raw_vs_ipa.png" if args.save else None, + title="Raw vs IPA Text Size by Language (UTF-8 bytes)", + ) + + # NEW: grouped raw vs ipa vs tokenized (if filtered_json provided and any matches exist) + if tok_kb_map: + any_tok = any(s.tok_bytes is not None for s in stats) + if any_tok: + make_grouped_raw_ipa_tok( + stats, + outdir / f"bar_grouped_raw_ipa_{args.tok_method}.png" if args.save else None, + tok_label=args.tok_method, + title=f"Raw vs IPA vs {args.tok_method} (bytes)", + ) + else: + print(f"[warn] --filtered-json provided but no tokenized_sizes['{args.tok_method}'] matched your lang keys.") + + if args.save and args.csv: + write_csv(stats, outdir / "ipa_vs_raw_stats.csv") + + for s in stats: + tok_str = "n/a" if s.tok_bytes is None else str(s.tok_bytes) + print( + f"{s.lang:14s} raw={s.raw_bytes:8d} " + f"ipa={s.ipa_bytes:8d} " + f"tok({args.tok_method})={tok_str:>8s} " + f"ratio={s.ratio_bytes:6.3f} " + f"delta={s.delta_bytes:8d}" + ) + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/plot_langscript_sizes_grouped.py b/data/flores200-res/plot_langscript_sizes_grouped.py new file mode 100644 index 0000000000..c1462018c1 --- /dev/null +++ b/data/flores200-res/plot_langscript_sizes_grouped.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +import json +import re +from collections import Counter, defaultdict +import matplotlib.pyplot as plt + +# text__ + + + +
+

{title}

+ Pick a token to update per-file counts; heatmap shows file similarity via high-frequency vocab. +
+ +
+
+

Vocab + total frequency (directory aggregate)

+
+ + + + +
+
+
+ Showing top {len(token_rows)} tokens by frequency. +
+
+ +
+

Per-file counts + similarity heatmap

+
+
+
+
+
+ Heatmap uses TF-IDF over high-frequency SentencePiece tokens and cosine similarity. +
+
+
+ + + + +""" + out_path.write_text(html, encoding="utf-8") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--vocab", default="trained_spm_model.vocab", + help="SentencePiece vocab file (default: trained_spm_model.vocab). Used to infer .model if --model not given.") + ap.add_argument("--model", default=None, + help="SentencePiece model file (.model). If omitted, inferred from --vocab by replacing .vocab -> .model.") + ap.add_argument("--dir", required=True, + help="Directory of text files to scan.") + ap.add_argument("--recursive", action="store_true", + help="Recurse into subdirectories (default: false).") + ap.add_argument("--suffixes", default=".txt", + help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + + ap.add_argument("--top-k", type=int, default=1500, + help="Embed only top-K tokens by total frequency into the HTML (default: 1500).") + ap.add_argument("--min-count", type=int, default=1, + help="Only consider tokens with total count >= this (default: 1).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + + # NEW: heatmap controls (no dendro mode; just heatmap) + ap.add_argument("--heatmap", action="store_true", + help="Compute and embed a file similarity heatmap (requires numpy).") + ap.add_argument("--heatmap-top-k", type=int, default=300, + help="Use top-K frequent tokens (from the directory) as features for TF-IDF similarity (default: 300).") + ap.add_argument("--heatmap-reorder", action="store_true", + help="Reorder files to group similar ones (simple greedy heuristic, no SciPy).") + + args = ap.parse_args() + + vocab_path = Path(args.vocab) + model_path = Path(args.model) if args.model else infer_model_path_from_vocab(vocab_path) + root = Path(args.dir) + out = Path(args.out) + + if not model_path.exists(): + raise SystemExit(f"SentencePiece model not found: {model_path} (pass --model or ensure it matches --vocab)") + if not root.exists() or not root.is_dir(): + raise SystemExit(f"Directory not found: {root}") + + suffixes = tuple(s.strip().lower() for s in args.suffixes.split(",") if s.strip()) + files = iter_text_files(root, recursive=args.recursive, suffixes=suffixes) + if not files: + raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") + + sp = spm.SentencePieceProcessor(model_file=str(model_path)) + + print(f"[info] model: {model_path}") + print(f"[info] vocab size: {sp.get_piece_size()}") + print(f"[info] scanning {len(files)} files under: {root}") + + total = Counter() + per_file: Dict[str, Counter] = {} + file_names: List[str] = [] + + for p in files: + rel = str(p.relative_to(root)) + file_names.append(rel) + c = count_tokens_in_file(sp, p) + per_file[rel] = c + total.update(c) + + # Token UI: top tokens by directory frequency + items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] + items.sort(key=lambda x: x[1], reverse=True) + if not items: + raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") + + top_items = items[: max(1, args.top_k)] + + token_rows: List[Dict] = [] + per_file_counts: Dict[str, Dict[str, int]] = {} + + for tid, cnt in top_items: + tok = human_token(sp.id_to_piece(int(tid))) + token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) + + for tid, _ in top_items: + tid = int(tid) + k = str(tid) + per_file_counts[k] = {} + for fn in file_names: + v = per_file[fn].get(tid, 0) + if v: + per_file_counts[k][fn] = 
int(v) + + default_token_id = int(top_items[0][0]) + + heatmap_payload: Optional[Dict] = None + if args.heatmap: + # Features for similarity + feat_tok_ids = [int(tid) for tid, _ in items[: max(2, args.heatmap_top_k)]] + print(f"[info] heatmap features: top {len(feat_tok_ids)} tokens (TF-IDF)") + try: + heatmap_payload = _build_heatmap_payload( + file_names=file_names, + per_file=per_file, + token_ids_for_heatmap=feat_tok_ids, + reorder=args.heatmap_reorder, + ) + except Exception as e: + heatmap_payload = {"ok": False, "reason": f"Failed to build heatmap: {e!r}"} + print(f"[warn] heatmap failed: {e!r}") + + title = f"SentencePiece token frequency dashboard ({root.name})" + build_html( + title=title, + token_rows=token_rows, + per_file_counts=per_file_counts, + file_order=file_names, + default_token_id=default_token_id, + heatmap_payload=heatmap_payload, + out_path=out, + ) + + print(f"[done] wrote: {out}") + print("[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + if args.heatmap and (np is None): + print("[note] Install heatmap deps: python3 -m pip install numpy") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/tokenization_vs_origina.sh b/data/flores200-res/tokenization_vs_origina.sh new file mode 100644 index 0000000000..03b7ab75ac --- /dev/null +++ b/data/flores200-res/tokenization_vs_origina.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode ratio \ + --method tiktoken \ + --group-by family --color-by family \ + --out ratio_family.png + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode tokenized_kb --method tiktoken \ + --group-by region --color-by region \ + --out tok_kb_region.png + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode ratio --method tiktoken \ + --group-by script --color-by script \ + --out ratio_script.png + diff --git a/data/flores200-res/tokenize.sh b/data/flores200-res/tokenize.sh new file mode 100644 index 0000000000..634bf10554 --- /dev/null +++ b/data/flores200-res/tokenize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +python3 tokenize_and_annotate_sizes.py \ + --in-json filtered_files.json \ + --method tiktoken \ + --tiktoken-encoding gpt2 + diff --git a/data/flores200-res/tokenize_and_annotate_sizes.py b/data/flores200-res/tokenize_and_annotate_sizes.py new file mode 100644 index 0000000000..e39d834899 --- /dev/null +++ b/data/flores200-res/tokenize_and_annotate_sizes.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +tokenize_and_annotate_sizes.py + +Reads the *filtered* JSON produced by filter_files_by_script.py (one entry per file), +runs prepare.py (assumed symlinked as ./prepare.py) to tokenize each text file with +100% train split (no val), writes: + + text__