diff --git a/data/flores200-res/.gitignore b/data/flores200-res/.gitignore new file mode 100644 index 0000000000..e33609d251 --- /dev/null +++ b/data/flores200-res/.gitignore @@ -0,0 +1 @@ +*.png diff --git a/data/flores200-res/README.md b/data/flores200-res/README.md new file mode 100644 index 0000000000..aab97633c7 --- /dev/null +++ b/data/flores200-res/README.md @@ -0,0 +1,20 @@ +# Scripts compatible with Flores-200 Restructured + +This folder contains scripts compatible with the Flores-200 project, originally +from: +https://github.com/facebookresearch/flores/blob/main/README.md + +These scripts, however, target the restructured format proposed by muhammadravi251001: +https://huggingface.co/datasets/muhammadravi251001/restructured-flores200 + +## License of dataset + +The Flores-200 dataset is licensed under CC-BY-SA 4.0. + +## Language Codes + +Language codes for Flores-200: +https://github.com/facebookresearch/flores/blob/main/flores200/README.md + +Language codes for espeak (the basis of many of the phoneticizers): +https://espeak.sourceforge.net/languages.html diff --git a/data/flores200-res/eng_stats.json b/data/flores200-res/eng_stats.json new file mode 100644 index 0000000000..abcded9fae --- /dev/null +++ b/data/flores200-res/eng_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208634, + "not_transcribed_bytes": 7905, + "total_bytes": 216539, + "pct_transcribed": 96.34938740827288, + "pct_not_transcribed": 3.6506125917271253 +} \ No newline at end of file diff --git a/data/flores200-res/filter_files_by_script.py b/data/flores200-res/filter_files_by_script.py new file mode 100644 index 0000000000..3923918ddf --- /dev/null +++ b/data/flores200-res/filter_files_by_script.py @@ -0,0 +1,89 @@ + +#!/usr/bin/env python3 +""" +filter_files_by_script.py + +Read files.json and emit a simplified JSON with only fields +relevant to script/language analysis. + +Keeps: + - language (ISO 639-3) + - script (ISO 15924) + - lang_script (language_script) + - size_kb (float) + - filename (optional but useful) +""" + +import json +import re +import argparse + +FNAME_RE = re.compile(r"^text_([a-z]{3})_([A-Za-z]{4})\.txt$") + + +def parse_size_to_kb(size_str: str) -> float: + """ + Convert ls -h style sizes to KB.
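+
+    For example (per the multiplier table below):
+        >>> parse_size_to_kb("512")
+        0.5
+        >>> parse_size_to_kb("1.5K")
+        1.5
+        >>> parse_size_to_kb("2M")
+        2048.0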
+ """ + m = re.match(r"^\s*([0-9]*\.?[0-9]+)\s*([KMGTP]?)(B?)\s*$", size_str, re.IGNORECASE) + if not m: + raise ValueError(f"Unrecognized size string: {size_str!r}") + + val = float(m.group(1)) + unit = m.group(2).upper() + + mult = { + "": 1.0 / 1024.0, # bytes -> KB + "K": 1.0, + "M": 1024.0, + "G": 1024.0**2, + "T": 1024.0**3, + "P": 1024.0**4, + }[unit] + + return val * mult + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--json", default="files.json", help="Input files.json") + ap.add_argument("--out", default="filtered_scripts.json", help="Output JSON") + ap.add_argument("--drop-filename", action="store_true", + help="Do not include original filename in output") + args = ap.parse_args() + + with open(args.json, "r", encoding="utf-8") as f: + rows = json.load(f) + + filtered = [] + + for r in rows: + name = r.get("name", "") + m = FNAME_RE.match(name) + if not m: + continue + + lang, script = m.groups() + size_kb = parse_size_to_kb(str(r["size"])) + + entry = { + "language": lang, + "script": script, + "lang_script": f"{lang}_{script}", + "size_kb": size_kb, + } + + if not args.drop_filename: + entry["filename"] = name + + filtered.append(entry) + + with open(args.out, "w", encoding="utf-8") as f: + json.dump(filtered, f, indent=2, ensure_ascii=False) + + print(f"Wrote {len(filtered)} entries to {args.out}") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/filtered_files.json b/data/flores200-res/filtered_files.json new file mode 100644 index 0000000000..c79266cebf --- /dev/null +++ b/data/flores200-res/filtered_files.json @@ -0,0 +1,1512 @@ +[ + { + "language": "yue", + "script": "Hant", + "lang_script": "yue_Hant", + "size_kb": 221.0, + "filename": "text_yue_Hant.txt", + "tokenized_sizes": { + "tiktoken": 318.50390625 + } + }, + { + "language": "zho", + "script": "Hans", + "lang_script": "zho_Hans", + "size_kb": 235.0, + "filename": "text_zho_Hans.txt", + "tokenized_sizes": { + "tiktoken": 331.03125 + } + }, + { + "language": "ace", + "script": "Arab", + "lang_script": "ace_Arab", + "size_kb": 383.0, + "filename": "text_ace_Arab.txt", + "tokenized_sizes": { + "tiktoken": 492.009765625 + } + }, + { + "language": "ace", + "script": "Latn", + "lang_script": "ace_Latn", + "size_kb": 277.0, + "filename": "text_ace_Latn.txt", + "tokenized_sizes": { + "tiktoken": 226.68359375 + } + }, + { + "language": "acm", + "script": "Arab", + "lang_script": "acm_Arab", + "size_kb": 396.0, + "filename": "text_acm_Arab.txt", + "tokenized_sizes": { + "tiktoken": 439.6796875 + } + }, + { + "language": "acq", + "script": "Arab", + "lang_script": "acq_Arab", + "size_kb": 400.0, + "filename": "text_acq_Arab.txt", + "tokenized_sizes": { + "tiktoken": 447.302734375 + } + }, + { + "language": "aeb", + "script": "Arab", + "lang_script": "aeb_Arab", + "size_kb": 390.0, + "filename": "text_aeb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 433.5234375 + } + }, + { + "language": "afr", + "script": "Latn", + "lang_script": "afr_Latn", + "size_kb": 272.0, + "filename": "text_afr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.251953125 + } + }, + { + "language": "ajp", + "script": "Arab", + "lang_script": "ajp_Arab", + "size_kb": 377.0, + "filename": "text_ajp_Arab.txt", + "tokenized_sizes": { + "tiktoken": 415.3125 + } + }, + { + "language": "aka", + "script": "Latn", + "lang_script": "aka_Latn", + "size_kb": 279.0, + "filename": "text_aka_Latn.txt", + "tokenized_sizes": { + "tiktoken": 291.162109375 + } + }, + { + "language": "als", + "script": "Latn", + 
"lang_script": "als_Latn", + "size_kb": 306.0, + "filename": "text_als_Latn.txt", + "tokenized_sizes": { + "tiktoken": 276.4609375 + } + }, + { + "language": "amh", + "script": "Ethi", + "lang_script": "amh_Ethi", + "size_kb": 436.0, + "filename": "text_amh_Ethi.txt", + "tokenized_sizes": { + "tiktoken": 803.419921875 + } + }, + { + "language": "apc", + "script": "Arab", + "lang_script": "apc_Arab", + "size_kb": 376.0, + "filename": "text_apc_Arab.txt", + "tokenized_sizes": { + "tiktoken": 416.90625 + } + }, + { + "language": "arb", + "script": "Arab", + "lang_script": "arb_Arab", + "size_kb": 405.0, + "filename": "text_arb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 453.7265625 + } + }, + { + "language": "arb", + "script": "Latn", + "lang_script": "arb_Latn", + "size_kb": 296.0, + "filename": "text_arb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 262.181640625 + } + }, + { + "language": "ars", + "script": "Arab", + "lang_script": "ars_Arab", + "size_kb": 405.0, + "filename": "text_ars_Arab.txt", + "tokenized_sizes": { + "tiktoken": 454.37890625 + } + }, + { + "language": "ary", + "script": "Arab", + "lang_script": "ary_Arab", + "size_kb": 395.0, + "filename": "text_ary_Arab.txt", + "tokenized_sizes": { + "tiktoken": 434.587890625 + } + }, + { + "language": "arz", + "script": "Arab", + "lang_script": "arz_Arab", + "size_kb": 396.0, + "filename": "text_arz_Arab.txt", + "tokenized_sizes": { + "tiktoken": 437.30078125 + } + }, + { + "language": "asm", + "script": "Beng", + "lang_script": "asm_Beng", + "size_kb": 644.0, + "filename": "text_asm_Beng.txt", + "tokenized_sizes": { + "tiktoken": 1005.07421875 + } + }, + { + "language": "ast", + "script": "Latn", + "lang_script": "ast_Latn", + "size_kb": 270.0, + "filename": "text_ast_Latn.txt", + "tokenized_sizes": { + "tiktoken": 197.443359375 + } + }, + { + "language": "awa", + "script": "Deva", + "lang_script": "awa_Deva", + "size_kb": 634.0, + "filename": "text_awa_Deva.txt", + "tokenized_sizes": { + "tiktoken": 742.212890625 + } + }, + { + "language": "ayr", + "script": "Latn", + "lang_script": "ayr_Latn", + "size_kb": 272.0, + "filename": "text_ayr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 243.201171875 + } + }, + { + "language": "azb", + "script": "Arab", + "lang_script": "azb_Arab", + "size_kb": 412.0, + "filename": "text_azb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 531.052734375 + } + }, + { + "language": "azj", + "script": "Latn", + "lang_script": "azj_Latn", + "size_kb": 320.0, + "filename": "text_azj_Latn.txt", + "tokenized_sizes": { + "tiktoken": 360.34765625 + } + }, + { + "language": "bak", + "script": "Cyrl", + "lang_script": "bak_Cyrl", + "size_kb": 470.0, + "filename": "text_bak_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 620.775390625 + } + }, + { + "language": "bam", + "script": "Latn", + "lang_script": "bam_Latn", + "size_kb": 265.0, + "filename": "text_bam_Latn.txt", + "tokenized_sizes": { + "tiktoken": 277.44921875 + } + }, + { + "language": "ban", + "script": "Latn", + "lang_script": "ban_Latn", + "size_kb": 282.0, + "filename": "text_ban_Latn.txt", + "tokenized_sizes": { + "tiktoken": 207.21875 + } + }, + { + "language": "bel", + "script": "Cyrl", + "lang_script": "bel_Cyrl", + "size_kb": 523.0, + "filename": "text_bel_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 677.169921875 + } + }, + { + "language": "bem", + "script": "Latn", + "lang_script": "bem_Latn", + "size_kb": 312.0, + "filename": "text_bem_Latn.txt", + "tokenized_sizes": { + "tiktoken": 257.189453125 + } + }, + { + "language": "ben", + 
"script": "Beng", + "lang_script": "ben_Beng", + "size_kb": 661.0, + "filename": "text_ben_Beng.txt", + "tokenized_sizes": { + "tiktoken": 991.65625 + } + }, + { + "language": "bho", + "script": "Deva", + "lang_script": "bho_Deva", + "size_kb": 626.0, + "filename": "text_bho_Deva.txt", + "tokenized_sizes": { + "tiktoken": 740.771484375 + } + }, + { + "language": "bjn", + "script": "Arab", + "lang_script": "bjn_Arab", + "size_kb": 428.0, + "filename": "text_bjn_Arab.txt", + "tokenized_sizes": { + "tiktoken": 517.115234375 + } + }, + { + "language": "bjn", + "script": "Latn", + "lang_script": "bjn_Latn", + "size_kb": 267.0, + "filename": "text_bjn_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.12109375 + } + }, + { + "language": "bod", + "script": "Tibt", + "lang_script": "bod_Tibt", + "size_kb": 840.0, + "filename": "text_bod_Tibt.txt", + "tokenized_sizes": { + "tiktoken": 1532.25 + } + }, + { + "language": "bos", + "script": "Latn", + "lang_script": "bos_Latn", + "size_kb": 262.0, + "filename": "text_bos_Latn.txt", + "tokenized_sizes": { + "tiktoken": 229.4296875 + } + }, + { + "language": "bug", + "script": "Latn", + "lang_script": "bug_Latn", + "size_kb": 278.0, + "filename": "text_bug_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.97265625 + } + }, + { + "language": "bul", + "script": "Cyrl", + "lang_script": "bul_Cyrl", + "size_kb": 479.0, + "filename": "text_bul_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 569.134765625 + } + }, + { + "language": "cat", + "script": "Latn", + "lang_script": "cat_Latn", + "size_kb": 285.0, + "filename": "text_cat_Latn.txt", + "tokenized_sizes": { + "tiktoken": 201.0390625 + } + }, + { + "language": "ceb", + "script": "Latn", + "lang_script": "ceb_Latn", + "size_kb": 304.0, + "filename": "text_ceb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 234.4296875 + } + }, + { + "language": "ces", + "script": "Latn", + "lang_script": "ces_Latn", + "size_kb": 274.0, + "filename": "text_ces_Latn.txt", + "tokenized_sizes": { + "tiktoken": 273.87109375 + } + }, + { + "language": "cjk", + "script": "Latn", + "lang_script": "cjk_Latn", + "size_kb": 272.0, + "filename": "text_cjk_Latn.txt", + "tokenized_sizes": { + "tiktoken": 226.83984375 + } + }, + { + "language": "ckb", + "script": "Arab", + "lang_script": "ckb_Arab", + "size_kb": 287.0, + "filename": "text_ckb_Arab.txt", + "tokenized_sizes": { + "tiktoken": 423.845703125 + } + }, + { + "language": "crh", + "script": "Latn", + "lang_script": "crh_Latn", + "size_kb": 285.0, + "filename": "text_crh_Latn.txt", + "tokenized_sizes": { + "tiktoken": 258.728515625 + } + }, + { + "language": "cym", + "script": "Latn", + "lang_script": "cym_Latn", + "size_kb": 272.0, + "filename": "text_cym_Latn.txt", + "tokenized_sizes": { + "tiktoken": 246.384765625 + } + }, + { + "language": "dan", + "script": "Latn", + "lang_script": "dan_Latn", + "size_kb": 266.0, + "filename": "text_dan_Latn.txt", + "tokenized_sizes": { + "tiktoken": 198.712890625 + } + }, + { + "language": "deu", + "script": "Latn", + "lang_script": "deu_Latn", + "size_kb": 301.0, + "filename": "text_deu_Latn.txt", + "tokenized_sizes": { + "tiktoken": 224.591796875 + } + }, + { + "language": "dik", + "script": "Latn", + "lang_script": "dik_Latn", + "size_kb": 245.0, + "filename": "text_dik_Latn.txt", + "tokenized_sizes": { + "tiktoken": 258.310546875 + } + }, + { + "language": "dyu", + "script": "Latn", + "lang_script": "dyu_Latn", + "size_kb": 272.0, + "filename": "text_dyu_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.681640625 + } + }, + { + 
"language": "dzo", + "script": "Tibt", + "lang_script": "dzo_Tibt", + "size_kb": 921.0, + "filename": "text_dzo_Tibt.txt", + "tokenized_sizes": { + "tiktoken": 1679.669921875 + } + }, + { + "language": "ell", + "script": "Grek", + "lang_script": "ell_Grek", + "size_kb": 550.0, + "filename": "text_ell_Grek.txt", + "tokenized_sizes": { + "tiktoken": 674.869140625 + } + }, + { + "language": "eng", + "script": "Latn", + "lang_script": "eng_Latn", + "size_kb": 254.0, + "filename": "text_eng_Latn.txt", + "tokenized_sizes": { + "tiktoken": 107.015625 + } + }, + { + "language": "epo", + "script": "Latn", + "lang_script": "epo_Latn", + "size_kb": 258.0, + "filename": "text_epo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 212.609375 + } + }, + { + "language": "est", + "script": "Latn", + "lang_script": "est_Latn", + "size_kb": 257.0, + "filename": "text_est_Latn.txt", + "tokenized_sizes": { + "tiktoken": 221.576171875 + } + }, + { + "language": "eus", + "script": "Latn", + "lang_script": "eus_Latn", + "size_kb": 270.0, + "filename": "text_eus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 220.109375 + } + }, + { + "language": "ewe", + "script": "Latn", + "lang_script": "ewe_Latn", + "size_kb": 271.0, + "filename": "text_ewe_Latn.txt", + "tokenized_sizes": { + "tiktoken": 301.962890625 + } + }, + { + "language": "fao", + "script": "Latn", + "lang_script": "fao_Latn", + "size_kb": 278.0, + "filename": "text_fao_Latn.txt", + "tokenized_sizes": { + "tiktoken": 249.02734375 + } + }, + { + "language": "fij", + "script": "Latn", + "lang_script": "fij_Latn", + "size_kb": 297.0, + "filename": "text_fij_Latn.txt", + "tokenized_sizes": { + "tiktoken": 240.689453125 + } + }, + { + "language": "fin", + "script": "Latn", + "lang_script": "fin_Latn", + "size_kb": 281.0, + "filename": "text_fin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 238.443359375 + } + }, + { + "language": "fon", + "script": "Latn", + "lang_script": "fon_Latn", + "size_kb": 320.0, + "filename": "text_fon_Latn.txt", + "tokenized_sizes": { + "tiktoken": 422.541015625 + } + }, + { + "language": "fra", + "script": "Latn", + "lang_script": "fra_Latn", + "size_kb": 313.0, + "filename": "text_fra_Latn.txt", + "tokenized_sizes": { + "tiktoken": 209.17578125 + } + }, + { + "language": "fur", + "script": "Latn", + "lang_script": "fur_Latn", + "size_kb": 287.0, + "filename": "text_fur_Latn.txt", + "tokenized_sizes": { + "tiktoken": 216.59375 + } + }, + { + "language": "fuv", + "script": "Latn", + "lang_script": "fuv_Latn", + "size_kb": 243.0, + "filename": "text_fuv_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.873046875 + } + }, + { + "language": "gaz", + "script": "Latn", + "lang_script": "gaz_Latn", + "size_kb": 305.0, + "filename": "text_gaz_Latn.txt", + "tokenized_sizes": { + "tiktoken": 264.150390625 + } + }, + { + "language": "gla", + "script": "Latn", + "lang_script": "gla_Latn", + "size_kb": 325.0, + "filename": "text_gla_Latn.txt", + "tokenized_sizes": { + "tiktoken": 281.35546875 + } + }, + { + "language": "gle", + "script": "Latn", + "lang_script": "gle_Latn", + "size_kb": 312.0, + "filename": "text_gle_Latn.txt", + "tokenized_sizes": { + "tiktoken": 266.9453125 + } + }, + { + "language": "glg", + "script": "Latn", + "lang_script": "glg_Latn", + "size_kb": 287.0, + "filename": "text_glg_Latn.txt", + "tokenized_sizes": { + "tiktoken": 200.67578125 + } + }, + { + "language": "grn", + "script": "Latn", + "lang_script": "grn_Latn", + "size_kb": 275.0, + "filename": "text_grn_Latn.txt", + "tokenized_sizes": { + "tiktoken": 
256.6171875 + } + }, + { + "language": "guj", + "script": "Gujr", + "lang_script": "guj_Gujr", + "size_kb": 633.0, + "filename": "text_guj_Gujr.txt", + "tokenized_sizes": { + "tiktoken": 1259.890625 + } + }, + { + "language": "hat", + "script": "Latn", + "lang_script": "hat_Latn", + "size_kb": 240.0, + "filename": "text_hat_Latn.txt", + "tokenized_sizes": { + "tiktoken": 200.294921875 + } + }, + { + "language": "hau", + "script": "Latn", + "lang_script": "hau_Latn", + "size_kb": 274.0, + "filename": "text_hau_Latn.txt", + "tokenized_sizes": { + "tiktoken": 225.41015625 + } + }, + { + "language": "heb", + "script": "Hebr", + "lang_script": "heb_Hebr", + "size_kb": 352.0, + "filename": "text_heb_Hebr.txt", + "tokenized_sizes": { + "tiktoken": 452.626953125 + } + }, + { + "language": "hin", + "script": "Deva", + "lang_script": "hin_Deva", + "size_kb": 646.0, + "filename": "text_hin_Deva.txt", + "tokenized_sizes": { + "tiktoken": 769.654296875 + } + }, + { + "language": "hne", + "script": "Deva", + "lang_script": "hne_Deva", + "size_kb": 624.0, + "filename": "text_hne_Deva.txt", + "tokenized_sizes": { + "tiktoken": 743.744140625 + } + }, + { + "language": "hrv", + "script": "Latn", + "lang_script": "hrv_Latn", + "size_kb": 256.0, + "filename": "text_hrv_Latn.txt", + "tokenized_sizes": { + "tiktoken": 224.17578125 + } + }, + { + "language": "hun", + "script": "Latn", + "lang_script": "hun_Latn", + "size_kb": 293.0, + "filename": "text_hun_Latn.txt", + "tokenized_sizes": { + "tiktoken": 277.033203125 + } + }, + { + "language": "hye", + "script": "Armn", + "lang_script": "hye_Armn", + "size_kb": 518.0, + "filename": "text_hye_Armn.txt", + "tokenized_sizes": { + "tiktoken": 1028.42578125 + } + }, + { + "language": "ibo", + "script": "Latn", + "lang_script": "ibo_Latn", + "size_kb": 306.0, + "filename": "text_ibo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 355.85546875 + } + }, + { + "language": "ilo", + "script": "Latn", + "lang_script": "ilo_Latn", + "size_kb": 307.0, + "filename": "text_ilo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 236.40234375 + } + }, + { + "language": "ind", + "script": "Latn", + "lang_script": "ind_Latn", + "size_kb": 275.0, + "filename": "text_ind_Latn.txt", + "tokenized_sizes": { + "tiktoken": 207.615234375 + } + }, + { + "language": "isl", + "script": "Latn", + "lang_script": "isl_Latn", + "size_kb": 277.0, + "filename": "text_isl_Latn.txt", + "tokenized_sizes": { + "tiktoken": 253.580078125 + } + }, + { + "language": "ita", + "script": "Latn", + "lang_script": "ita_Latn", + "size_kb": 301.0, + "filename": "text_ita_Latn.txt", + "tokenized_sizes": { + "tiktoken": 210.482421875 + } + }, + { + "language": "jav", + "script": "Latn", + "lang_script": "jav_Latn", + "size_kb": 264.0, + "filename": "text_jav_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.388671875 + } + }, + { + "language": "jpn", + "script": "Jpan", + "lang_script": "jpn_Jpan", + "size_kb": 322.0, + "filename": "text_jpn_Jpan.txt", + "tokenized_sizes": { + "tiktoken": 309.5625 + } + }, + { + "language": "kab", + "script": "Latn", + "lang_script": "kab_Latn", + "size_kb": 268.0, + "filename": "text_kab_Latn.txt", + "tokenized_sizes": { + "tiktoken": 260.916015625 + } + }, + { + "language": "kac", + "script": "Latn", + "lang_script": "kac_Latn", + "size_kb": 322.0, + "filename": "text_kac_Latn.txt", + "tokenized_sizes": { + "tiktoken": 275.46875 + } + }, + { + "language": "kam", + "script": "Latn", + "lang_script": "kam_Latn", + "size_kb": 258.0, + "filename": "text_kam_Latn.txt", + 
"tokenized_sizes": { + "tiktoken": 242.33984375 + } + }, + { + "language": "kan", + "script": "Knda", + "lang_script": "kan_Knda", + "size_kb": 718.0, + "filename": "text_kan_Knda.txt", + "tokenized_sizes": { + "tiktoken": 1404.755859375 + } + }, + { + "language": "kas", + "script": "Arab", + "lang_script": "kas_Arab", + "size_kb": 437.0, + "filename": "text_kas_Arab.txt", + "tokenized_sizes": { + "tiktoken": 636.080078125 + } + }, + { + "language": "kas", + "script": "Deva", + "lang_script": "kas_Deva", + "size_kb": 608.0, + "filename": "text_kas_Deva.txt", + "tokenized_sizes": { + "tiktoken": 725.751953125 + } + }, + { + "language": "kat", + "script": "Geor", + "lang_script": "kat_Geor", + "size_kb": 747.0, + "filename": "text_kat_Geor.txt", + "tokenized_sizes": { + "tiktoken": 1425.55078125 + } + }, + { + "language": "kaz", + "script": "Cyrl", + "lang_script": "kaz_Cyrl", + "size_kb": 478.0, + "filename": "text_kaz_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 611.3046875 + } + }, + { + "language": "kbp", + "script": "Latn", + "lang_script": "kbp_Latn", + "size_kb": 348.0, + "filename": "text_kbp_Latn.txt", + "tokenized_sizes": { + "tiktoken": 503.822265625 + } + }, + { + "language": "kea", + "script": "Latn", + "lang_script": "kea_Latn", + "size_kb": 258.0, + "filename": "text_kea_Latn.txt", + "tokenized_sizes": { + "tiktoken": 203.4453125 + } + }, + { + "language": "khk", + "script": "Cyrl", + "lang_script": "khk_Cyrl", + "size_kb": 485.0, + "filename": "text_khk_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 662.623046875 + } + }, + { + "language": "khm", + "script": "Khmr", + "lang_script": "khm_Khmr", + "size_kb": 845.0, + "filename": "text_khm_Khmr.txt", + "tokenized_sizes": { + "tiktoken": 1578.11328125 + } + }, + { + "language": "kik", + "script": "Latn", + "lang_script": "kik_Latn", + "size_kb": 329.0, + "filename": "text_kik_Latn.txt", + "tokenized_sizes": { + "tiktoken": 357.6328125 + } + }, + { + "language": "kin", + "script": "Latn", + "lang_script": "kin_Latn", + "size_kb": 288.0, + "filename": "text_kin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 247.646484375 + } + }, + { + "language": "kir", + "script": "Cyrl", + "lang_script": "kir_Cyrl", + "size_kb": 477.0, + "filename": "text_kir_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 592.859375 + } + }, + { + "language": "kmb", + "script": "Latn", + "lang_script": "kmb_Latn", + "size_kb": 282.0, + "filename": "text_kmb_Latn.txt", + "tokenized_sizes": { + "tiktoken": 244.283203125 + } + }, + { + "language": "kmr", + "script": "Latn", + "lang_script": "kmr_Latn", + "size_kb": 279.0, + "filename": "text_kmr_Latn.txt", + "tokenized_sizes": { + "tiktoken": 256.443359375 + } + }, + { + "language": "knc", + "script": "Arab", + "lang_script": "knc_Arab", + "size_kb": 405.0, + "filename": "text_knc_Arab.txt", + "tokenized_sizes": { + "tiktoken": 488.65234375 + } + }, + { + "language": "knc", + "script": "Latn", + "lang_script": "knc_Latn", + "size_kb": 282.0, + "filename": "text_knc_Latn.txt", + "tokenized_sizes": { + "tiktoken": 269.248046875 + } + }, + { + "language": "kon", + "script": "Latn", + "lang_script": "kon_Latn", + "size_kb": 289.0, + "filename": "text_kon_Latn.txt", + "tokenized_sizes": { + "tiktoken": 227.642578125 + } + }, + { + "language": "kor", + "script": "Hang", + "lang_script": "kor_Hang", + "size_kb": 304.0, + "filename": "text_kor_Hang.txt", + "tokenized_sizes": { + "tiktoken": 523.220703125 + } + }, + { + "language": "lao", + "script": "Laoo", + "lang_script": "lao_Laoo", + "size_kb": 692.0, + 
"filename": "text_lao_Laoo.txt", + "tokenized_sizes": { + "tiktoken": 1354.791015625 + } + }, + { + "language": "lij", + "script": "Latn", + "lang_script": "lij_Latn", + "size_kb": 296.0, + "filename": "text_lij_Latn.txt", + "tokenized_sizes": { + "tiktoken": 239.205078125 + } + }, + { + "language": "lim", + "script": "Latn", + "lang_script": "lim_Latn", + "size_kb": 272.0, + "filename": "text_lim_Latn.txt", + "tokenized_sizes": { + "tiktoken": 214.634765625 + } + }, + { + "language": "lin", + "script": "Latn", + "lang_script": "lin_Latn", + "size_kb": 274.0, + "filename": "text_lin_Latn.txt", + "tokenized_sizes": { + "tiktoken": 212.880859375 + } + }, + { + "language": "lit", + "script": "Latn", + "lang_script": "lit_Latn", + "size_kb": 270.0, + "filename": "text_lit_Latn.txt", + "tokenized_sizes": { + "tiktoken": 256.07421875 + } + }, + { + "language": "lmo", + "script": "Latn", + "lang_script": "lmo_Latn", + "size_kb": 294.0, + "filename": "text_lmo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 247.615234375 + } + }, + { + "language": "ltg", + "script": "Latn", + "lang_script": "ltg_Latn", + "size_kb": 266.0, + "filename": "text_ltg_Latn.txt", + "tokenized_sizes": { + "tiktoken": 250.259765625 + } + }, + { + "language": "ltz", + "script": "Latn", + "lang_script": "ltz_Latn", + "size_kb": 292.0, + "filename": "text_ltz_Latn.txt", + "tokenized_sizes": { + "tiktoken": 235.171875 + } + }, + { + "language": "lua", + "script": "Latn", + "lang_script": "lua_Latn", + "size_kb": 274.0, + "filename": "text_lua_Latn.txt", + "tokenized_sizes": { + "tiktoken": 223.138671875 + } + }, + { + "language": "lug", + "script": "Latn", + "lang_script": "lug_Latn", + "size_kb": 262.0, + "filename": "text_lug_Latn.txt", + "tokenized_sizes": { + "tiktoken": 227.66015625 + } + }, + { + "language": "luo", + "script": "Latn", + "lang_script": "luo_Latn", + "size_kb": 266.0, + "filename": "text_luo_Latn.txt", + "tokenized_sizes": { + "tiktoken": 214.599609375 + } + }, + { + "language": "lus", + "script": "Latn", + "lang_script": "lus_Latn", + "size_kb": 279.0, + "filename": "text_lus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 219.353515625 + } + }, + { + "language": "lvs", + "script": "Latn", + "lang_script": "lvs_Latn", + "size_kb": 283.0, + "filename": "text_lvs_Latn.txt", + "tokenized_sizes": { + "tiktoken": 265.765625 + } + }, + { + "language": "mag", + "script": "Deva", + "lang_script": "mag_Deva", + "size_kb": 625.0, + "filename": "text_mag_Deva.txt", + "tokenized_sizes": { + "tiktoken": 745.328125 + } + }, + { + "language": "mai", + "script": "Deva", + "lang_script": "mai_Deva", + "size_kb": 641.0, + "filename": "text_mai_Deva.txt", + "tokenized_sizes": { + "tiktoken": 767.1484375 + } + }, + { + "language": "mal", + "script": "Mlym", + "lang_script": "mal_Mlym", + "size_kb": 787.0, + "filename": "text_mal_Mlym.txt", + "tokenized_sizes": { + "tiktoken": 1565.658203125 + } + }, + { + "language": "mar", + "script": "Deva", + "lang_script": "mar_Deva", + "size_kb": 677.0, + "filename": "text_mar_Deva.txt", + "tokenized_sizes": { + "tiktoken": 810.67578125 + } + }, + { + "language": "min", + "script": "Arab", + "lang_script": "min_Arab", + "size_kb": 441.0, + "filename": "text_min_Arab.txt", + "tokenized_sizes": { + "tiktoken": 539.71875 + } + }, + { + "language": "min", + "script": "Latn", + "lang_script": "min_Latn", + "size_kb": 271.0, + "filename": "text_min_Latn.txt", + "tokenized_sizes": { + "tiktoken": 206.884765625 + } + }, + { + "language": "mkd", + "script": "Cyrl", + "lang_script": "mkd_Cyrl", + 
"size_kb": 480.0, + "filename": "text_mkd_Cyrl.txt", + "tokenized_sizes": { + "tiktoken": 564.958984375 + } + }, + { + "language": "mlt", + "script": "Latn", + "lang_script": "mlt_Latn", + "size_kb": 295.0, + "filename": "text_mlt_Latn.txt", + "tokenized_sizes": { + "tiktoken": 281.068359375 + } + }, + { + "language": "mni", + "script": "Beng", + "lang_script": "mni_Beng", + "size_kb": 701.0, + "filename": "text_mni_Beng.txt", + "tokenized_sizes": { + "tiktoken": 1048.951171875 + } + }, + { + "language": "mos", + "script": "Latn", + "lang_script": "mos_Latn", + "size_kb": 262.0, + "filename": "text_mos_Latn.txt", + "tokenized_sizes": { + "tiktoken": 264.98828125 + } + }, + { + "language": "mri", + "script": "Latn", + "lang_script": "mri_Latn", + "size_kb": 294.0, + "filename": "text_mri_Latn.txt", + "tokenized_sizes": { + "tiktoken": 255.35546875 + } + }, + { + "language": "mya", + "script": "Mymr", + "lang_script": "mya_Mymr", + "size_kb": 890.0, + "filename": "text_mya_Mymr.txt", + "tokenized_sizes": { + "tiktoken": 1738.203125 + } + }, + { + "language": "nld", + "script": "Latn", + "lang_script": "nld_Latn", + "size_kb": 283.0, + "filename": "text_nld_Latn.txt", + "tokenized_sizes": { + "tiktoken": 206.50390625 + } + }, + { + "language": "nno", + "script": "Latn", + "lang_script": "nno_Latn", + "size_kb": 263.0, + "filename": "text_nno_Latn.txt", + "tokenized_sizes": { + "tiktoken": 202.712890625 + } + }, + { + "language": "nob", + "script": "Latn", + "lang_script": "nob_Latn", + "size_kb": 261.0, + "filename": "text_nob_Latn.txt", + "tokenized_sizes": { + "tiktoken": 195.431640625 + } + }, + { + "language": "npi", + "script": "Deva", + "lang_script": "npi_Deva", + "size_kb": 650.0, + "filename": "text_npi_Deva.txt", + "tokenized_sizes": { + "tiktoken": 782.9453125 + } + }, + { + "language": "nso", + "script": "Latn", + "lang_script": "nso_Latn", + "size_kb": 298.0, + "filename": "text_nso_Latn.txt", + "tokenized_sizes": { + "tiktoken": 242.853515625 + } + }, + { + "language": "nus", + "script": "Latn", + "lang_script": "nus_Latn", + "size_kb": 335.0, + "filename": "text_nus_Latn.txt", + "tokenized_sizes": { + "tiktoken": 436.869140625 + } + }, + { + "language": "nya", + "script": "Latn", + "lang_script": "nya_Latn", + "size_kb": 285.0, + "filename": "text_nya_Latn.txt", + "tokenized_sizes": { + "tiktoken": 237.115234375 + } + }, + { + "language": "oci", + "script": "Latn", + "lang_script": "oci_Latn", + "size_kb": 298.0, + "filename": "text_oci_Latn.txt", + "tokenized_sizes": { + "tiktoken": 216.3203125 + } + }, + { + "language": "ory", + "script": "Orya", + "lang_script": "ory_Orya", + "size_kb": 693.0, + "filename": "text_ory_Orya.txt", + "tokenized_sizes": { + "tiktoken": 1374.439453125 + } + }, + { + "language": "pag", + "script": "Latn", + "lang_script": "pag_Latn", + "size_kb": 253.0, + "filename": "text_pag_Latn.txt", + "tokenized_sizes": { + "tiktoken": 174.919921875 + } + }, + { + "language": "pan", + "script": "Guru", + "lang_script": "pan_Guru", + "size_kb": 657.0, + "filename": "text_pan_Guru.txt", + "tokenized_sizes": { + "tiktoken": 815.029296875 + } + }, + { + "language": "pap", + "script": "Latn", + "lang_script": "pap_Latn", + "size_kb": 274.0, + "filename": "text_pap_Latn.txt", + "tokenized_sizes": { + "tiktoken": 208.033203125 + } + }, + { + "language": "pbt", + "script": "Arab", + "lang_script": "pbt_Arab", + "size_kb": 421.0, + "filename": "text_pbt_Arab.txt", + "tokenized_sizes": { + "tiktoken": 554.59375 + } + }, + { + "language": "pes", + "script": "Arab", + 
"lang_script": "pes_Arab", + "size_kb": 430.0, + "filename": "text_pes_Arab.txt", + "tokenized_sizes": { + "tiktoken": 547.0625 + } + }, + { + "language": "plt", + "script": "Latn", + "lang_script": "plt_Latn", + "size_kb": 319.0, + "filename": "text_plt_Latn.txt", + "tokenized_sizes": { + "tiktoken": 270.25390625 + } + }, + { + "language": "pol", + "script": "Latn", + "lang_script": "pol_Latn", + "size_kb": 286.0, + "filename": "text_pol_Latn.txt", + "tokenized_sizes": { + "tiktoken": 280.4609375 + } + }, + { + "language": "por", + "script": "Latn", + "lang_script": "por_Latn", + "size_kb": 283.0, + "filename": "text_por_Latn.txt", + "tokenized_sizes": { + "tiktoken": 202.765625 + } + }, + { + "language": "prs", + "script": "Arab", + "lang_script": "prs_Arab", + "size_kb": 413.0, + "filename": "text_prs_Arab.txt", + "tokenized_sizes": { + "tiktoken": 526.087890625 + } + }, + { + "language": "quy", + "script": "Latn", + "lang_script": "quy_Latn", + "size_kb": 273.0, + "filename": "text_quy_Latn.txt", + "tokenized_sizes": { + "tiktoken": 230.93359375 + } + }, + { + "language": "ron", + "script": "Latn", + "lang_script": "ron_Latn", + "size_kb": 301.0, + "filename": "text_ron_Latn.txt", + "tokenized_sizes": { + "tiktoken": 259.185546875 + } + }, + { + "language": "vie", + "script": "Latn", + "lang_script": "vie_Latn", + "size_kb": 360.0, + "filename": "text_vie_Latn.txt", + "tokenized_sizes": { + "tiktoken": 470.21875 + } + }, + { + "language": "swh", + "script": "Latn", + "lang_script": "swh_Latn", + "size_kb": 272.0, + "filename": "text_swh_Latn.txt", + "tokenized_sizes": { + "tiktoken": 223.5703125 + } + } +] \ No newline at end of file diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index deda56df65..db841c7057 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -1,25 +1,227 @@ #!/bin/bash -### Instructions: -# 1. Replace "INSERT_URL_WITH_FILES" with the actual URL to the Parquet files. -# 2. Modify the "include_keys" array to specify the keys you want to include in the output. -# 3. (Optionally) Modify the "value_prefixes" array to set prefixes for each value, use "" for empty prefixes -# 4. Set "--skip_empty" to true if you want to skip empty fields, or false if not needed. -# 5. Set "--no_output_text" to true if you plan to process the intermediate json files in a custom manner. -# 6. For CSV files with BOM headers, pass "--input_encoding utf-8-sig" to the helper script. -# 7. For CSV cells that contain multi-line text, use "--split_multiline_values" to emit one line per entry or -# "--newline_replacement" to substitute newline characters with custom text. 
- -# Run the Python script with the specified arguments lang_array=( "text_eng_Latn" "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" - "text_shn_Mymr" + "text_yue_Hant" + "text_hin_Deva" + "text_vie_Latn" + "text_ind_Latn" + "text_swh_Latn" + "text_ell_Grek" + "text_fra_Latn" ) +# lang_array=( +# "text_ace_Arab" +# "text_ace_Latn" +# "text_acm_Arab" +# "text_acq_Arab" +# "text_aeb_Arab" +# "text_afr_Latn" +# "text_ajp_Arab" +# "text_aka_Latn" +# "text_als_Latn" +# "text_amh_Ethi" +# "text_apc_Arab" +# "text_arb_Arab" +# "text_arb_Latn" +# "text_ars_Arab" +# "text_ary_Arab" +# "text_arz_Arab" +# "text_asm_Beng" +# "text_ast_Latn" +# "text_awa_Deva" +# "text_ayr_Latn" +# "text_azb_Arab" +# "text_azj_Latn" +# "text_bak_Cyrl" +# "text_bam_Latn" +# "text_ban_Latn" +# "text_bel_Cyrl" +# "text_bem_Latn" +# "text_ben_Beng" +# "text_bho_Deva" +# "text_bjn_Arab" +# "text_bjn_Latn" +# "text_bod_Tibt" +# "text_bos_Latn" +# "text_bug_Latn" +# "text_bul_Cyrl" +# "text_cat_Latn" +# "text_ceb_Latn" +# "text_ces_Latn" +# "text_cjk_Latn" +# "text_ckb_Arab" +# "text_crh_Latn" +# "text_cym_Latn" +# "text_dan_Latn" +# "text_deu_Latn" +# "text_dik_Latn" +# "text_dyu_Latn" +# "text_dzo_Tibt" +# "text_ell_Grek" +# "text_eng_Latn" +# "text_epo_Latn" +# "text_est_Latn" +# "text_eus_Latn" +# "text_ewe_Latn" +# "text_fao_Latn" +# "text_fij_Latn" +# "text_fin_Latn" +# "text_fon_Latn" +# "text_fra_Latn" +# "text_fur_Latn" +# "text_fuv_Latn" +# "text_gaz_Latn" +# "text_gla_Latn" +# "text_gle_Latn" +# "text_glg_Latn" +# "text_grn_Latn" +# "text_guj_Gujr" +# "text_hat_Latn" +# "text_hau_Latn" +# "text_heb_Hebr" +# "text_hin_Deva" +# "text_hne_Deva" +# "text_hrv_Latn" +# "text_hun_Latn" +# "text_hye_Armn" +# "text_ibo_Latn" +# "text_ilo_Latn" +# "text_ind_Latn" +# "text_isl_Latn" +# "text_ita_Latn" +# "text_jav_Latn" +# "text_jpn_Jpan" +# "text_kab_Latn" +# "text_kac_Latn" +# "text_kam_Latn" +# "text_kan_Knda" +# "text_kas_Arab" +# "text_kas_Deva" +# "text_kat_Geor" +# "text_kaz_Cyrl" +# "text_kbp_Latn" +# "text_kea_Latn" +# "text_khk_Cyrl" +# "text_khm_Khmr" +# "text_kik_Latn" +# "text_kin_Latn" +# "text_kir_Cyrl" +# "text_kmb_Latn" +# "text_kmr_Latn" +# "text_knc_Arab" +# "text_knc_Latn" +# "text_kon_Latn" +# "text_kor_Hang" +# "text_lao_Laoo" +# "text_lij_Latn" +# "text_lim_Latn" +# "text_lin_Latn" +# "text_lit_Latn" +# "text_lmo_Latn" +# "text_ltg_Latn" +# "text_ltz_Latn" +# "text_lua_Latn" +# "text_lug_Latn" +# "text_luo_Latn" +# "text_lus_Latn" +# "text_lvs_Latn" +# "text_mag_Deva" +# "text_mai_Deva" +# "text_mal_Mlym" +# "text_mar_Deva" +# "text_min_Arab" +# "text_min_Latn" +# "text_mkd_Cyrl" +# "text_mlt_Latn" +# "text_mni_Beng" +# "text_mos_Latn" +# "text_mri_Latn" +# "text_mya_Mymr" +# "text_nld_Latn" +# "text_nno_Latn" +# "text_nob_Latn" +# "text_npi_Deva" +# "text_nso_Latn" +# "text_nus_Latn" +# "text_nya_Latn" +# "text_oci_Latn" +# "text_ory_Orya" +# "text_pag_Latn" +# "text_pan_Guru" +# "text_pap_Latn" +# "text_pbt_Arab" +# "text_pes_Arab" +# "text_plt_Latn" +# "text_pol_Latn" +# "text_por_Latn" +# "text_prs_Arab" +# "text_quy_Latn" +# "text_ron_Latn" +# "text_run_Latn" +# "text_rus_Cyrl" +# "text_sag_Latn" +# "text_san_Deva" +# "text_sat_Olck" +# "text_scn_Latn" +# "text_shn_Mymr" +# "text_sin_Sinh" +# "text_slk_Latn" +# "text_slv_Latn" +# "text_smo_Latn" +# "text_sna_Latn" +# "text_snd_Arab" +# "text_som_Latn" +# "text_sot_Latn" +# "text_spa_Latn" +# "text_srd_Latn" +# "text_srp_Cyrl" +# "text_ssw_Latn" +# "text_sun_Latn" +# "text_swe_Latn" +# "text_swh_Latn" +# "text_szl_Latn" +# "text_tam_Taml" +# 
"text_taq_Latn" +# "text_taq_Tfng" +# "text_tat_Cyrl" +# "text_tel_Telu" +# "text_tgk_Cyrl" +# "text_tgl_Latn" +# "text_tha_Thai" +# "text_tir_Ethi" +# "text_tpi_Latn" +# "text_tsn_Latn" +# "text_tso_Latn" +# "text_tuk_Latn" +# "text_tum_Latn" +# "text_tur_Latn" +# "text_twi_Latn" +# "text_tzm_Tfng" +# "text_uig_Arab" +# "text_ukr_Cyrl" +# "text_umb_Latn" +# "text_urd_Arab" +# "text_uzn_Latn" +# "text_vec_Latn" +# "text_vie_Latn" +# "text_war_Latn" +# "text_wol_Latn" +# "text_xho_Latn" +# "text_ydd_Hebr" +# "text_yor_Latn" +# "text_yue_Hant" +# "text_zho_Hans" +# "text_zho_Hant" +# "text_zsm_Latn" +# "text_zul_Latn" +# ) + # Add url with dataset here: url="https://huggingface.co/datasets/muhammadravi251001/restructured-flores200/tree/main/data" @@ -29,5 +231,4 @@ for lang in "${lang_array[@]}"; do --include_keys "$lang" \ --value_prefix $'\n' \ --output_text_file "$lang".txt - done diff --git a/data/flores200-res/graphs.sh b/data/flores200-res/graphs.sh new file mode 100644 index 0000000000..aabf067fda --- /dev/null +++ b/data/flores200-res/graphs.sh @@ -0,0 +1,9 @@ +#!/bin/bash +python3 plot_langscript_sizes_grouped.py --group-by script --color-by script --out by_script.png +python3 plot_langscript_sizes_grouped.py --group-by script --color-by region --out by_region_script.png +python3 plot_langscript_sizes_grouped.py --group-by region --color-by region --out by_region.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by family --out by_family.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by script --out by_family_script.png + + + diff --git a/data/flores200-res/ipa_scripts.sh b/data/flores200-res/ipa_scripts.sh new file mode 100644 index 0000000000..ead45ced52 --- /dev/null +++ b/data/flores200-res/ipa_scripts.sh @@ -0,0 +1,23 @@ +# include tokenized comparison (uses tokenized_sizes["tiktoken"] from filtered_scripts.json) +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_files.json \ +# --tok-method tiktoken + +# # save everything to plots_out/ +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_scripts.json \ +# --tok-method tiktoken \ +# --save --outdir plots_out --csv + +# only keep languages that have tiktoken sizes +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_tiles.json \ +# --tok-method tiktoken \ +# --skip-missing-tok + + +python3 plot_ipa_vs_text.py --text-dir text --ipa-dir ipa --save --outdir plots_out --csv + diff --git a/data/flores200-res/ja_stats.json b/data/flores200-res/ja_stats.json new file mode 100644 index 0000000000..30b83d8af6 --- /dev/null +++ b/data/flores200-res/ja_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 305426, + "not_transcribed_bytes": 21596, + "total_bytes": 327022, + "pct_transcribed": 93.3961629492817, + "pct_not_transcribed": 6.6038370507183 +} \ No newline at end of file diff --git a/data/flores200-res/ko_stats.json b/data/flores200-res/ko_stats.json new file mode 100644 index 0000000000..5330f63aca --- /dev/null +++ b/data/flores200-res/ko_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 271690, + "not_transcribed_bytes": 8833, + "total_bytes": 280523, + "pct_transcribed": 96.85123857936783, + "pct_not_transcribed": 3.148761420632176 +} \ No newline at end of file diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 007c034696..2552a922ff 100644 --- a/data/flores200-res/phoneticize.sh +++ 
b/data/flores200-res/phoneticize.sh @@ -1,7 +1,25 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper +# python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +# python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +# python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +# python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json + +lang_array=( + "text_vie_Latn:vi" + "text_ind_Latn:id" + "text_swh_Latn:sw" + "text_ell_Grek:el" + "text_fra_Latn:fr" + "text_yue_Hant:yue" +) + +for lang in "${lang_array[@]}"; do + text_file="${lang%%:*}" + two_letter_code="${lang##*:}" + echo "${text_file}; ${two_letter_code}" + if [ ! -f "ipa_${text_file}.txt" ]; then + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + fi +done diff --git a/data/flores200-res/plot_ipa_vs_text.py b/data/flores200-res/plot_ipa_vs_text.py new file mode 100644 index 0000000000..5c0e246466 --- /dev/null +++ b/data/flores200-res/plot_ipa_vs_text.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +plot_ipa_vs_text.py + +Graphs IPA vs raw text sizes for paired files across directories: + + text/ + ipa/ + +Defaults: + --text-dir text/ + --ipa-dir ipa/ + +Tokenization: +- Can also load tokenized sizes (e.g. tiktoken) from filtered_scripts.json (or other) + produced by your tokenize_and_annotate_sizes.py pipeline, and plot: + + raw_bytes vs ipa_bytes vs tok_bytes + +Assumptions for filtered JSON rows: + - list[dict] + - key "lang_script" OR ("language"+"_"+"script") matches the part of text_.txt + - key "tokenized_sizes" is a dict like {"tiktoken": , ...} + +Produces (same as before): +- scatter: IPA bytes vs raw bytes +- bar: IPA/raw ratio +- bar: delta bytes (IPA - raw) + +Additionally (if filtered json provided & matches are found): +- grouped bar: Raw vs IPA vs Tokenized (bytes) per language + +""" + +from __future__ import annotations + +import argparse +import csv +from dataclasses import dataclass +from pathlib import Path +import math +import json + +from typing import Dict, List, Optional, Tuple + +import matplotlib.pyplot as plt + + +@dataclass +class PairStats: + lang: str + raw_path: Path + ipa_path: Path + raw_bytes: int + ipa_bytes: int + raw_chars: int + ipa_chars: int + raw_lines: int + ipa_lines: int + tok_bytes: Optional[int] = None # NEW + + @property + def ratio_bytes(self) -> float: + return (self.ipa_bytes / self.raw_bytes) if self.raw_bytes else float("inf") + + @property + def delta_bytes(self) -> int: + return self.ipa_bytes - self.raw_bytes + + +def read_stats(p: Path) -> Tuple[int, int, int]: + """ + Returns (utf8_bytes, chars, lines). 
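+
+    A file containing the two lines "ab" and "cd" with no trailing newline,
+    for example, yields (5, 5, 2): 5 UTF-8 bytes, 5 characters, 2 lines.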
+ """ + data = p.read_text(encoding="utf-8", errors="replace") + b = len(data.encode("utf-8")) + c = len(data) + lines = data.count("\n") + (1 if data and not data.endswith("\n") else 0) + return b, c, lines + + +def discover_pairs(text_dir: Path, ipa_dir: Path) -> List[Tuple[str, Path, Path]]: + """ + Finds pairs across directories: + text_dir/text_.txt + ipa_dir/ipa_text_.txt + + Note: can be "eng_Latn" etc; we treat it as an opaque key. + """ + raw_map: Dict[str, Path] = {} + ipa_map: Dict[str, Path] = {} + + for p in text_dir.iterdir(): + if p.is_file() and p.name.startswith("text_") and p.name.endswith(".txt"): + lang = p.name[len("text_") : -len(".txt")] + raw_map[lang] = p + + for p in ipa_dir.iterdir(): + if p.is_file() and p.name.startswith("ipa_text_") and p.name.endswith(".txt"): + lang = p.name[len("ipa_text_") : -len(".txt")] + ipa_map[lang] = p + + langs = sorted(set(raw_map) & set(ipa_map)) + return [(lang, raw_map[lang], ipa_map[lang]) for lang in langs] + + +def _load_tokenized_kb_map(filtered_json: Path, method: str) -> Dict[str, float]: + """ + Returns: { lang_script_key -> tokenized_size_kb } for the chosen method. + + Expects rows like: + { + "lang_script": "eng_Latn", + "tokenized_sizes": {"tiktoken": 300.0}, + ... + } + """ + if not filtered_json.exists(): + raise FileNotFoundError(f"filtered json not found: {filtered_json}") + + rows = json.loads(filtered_json.read_text(encoding="utf-8")) + if not isinstance(rows, list): + raise ValueError("filtered json must be a list of objects") + + out: Dict[str, float] = {} + for r in rows: + if not isinstance(r, dict): + continue + + key = r.get("lang_script") + if not key: + # try reconstruct + lang = r.get("language") + script = r.get("script") + if lang and script: + key = f"{lang}_{script}" + + if not key: + continue + + tok_map = r.get("tokenized_sizes") + if not isinstance(tok_map, dict): + continue + + v = tok_map.get(method) + if v is None: + continue + + try: + out[str(key)] = float(v) # KB + except Exception: + continue + + return out + + +def make_scatter(stats: List[PairStats], outpath: Optional[Path], title: str) -> None: + x = [s.raw_bytes for s in stats] + y = [s.ipa_bytes for s in stats] + labels = [s.lang for s in stats] + + plt.figure() + plt.scatter(x, y) + + for xi, yi, lab in zip(x, y, labels): + plt.annotate(lab, (xi, yi), textcoords="offset points", xytext=(6, 4)) + + plt.xlabel("Raw text size (UTF-8 bytes)") + plt.ylabel("IPA text size (UTF-8 bytes)") + plt.title(title) + plt.grid(True, linestyle="--", linewidth=0.5) + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def make_bar( + stats: List[PairStats], + values: List[float], + ylabel: str, + outpath: Optional[Path], + title: str, +) -> None: + langs = [s.lang for s in stats] + plt.figure(figsize=(max(8, 0.8 * len(langs)), 5)) + plt.bar(langs, values) + plt.ylabel(ylabel) + plt.title(title) + plt.xticks(rotation=35, ha="right") + plt.grid(True, axis="y", linestyle="--", linewidth=0.5) + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def _mean_std(vals: List[float]) -> Tuple[float, float]: + """ + Population mean/std (ddof=0) over vals. 
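+
+    Returns (0.0, 0.0) for an empty list. For example:
+        >>> _mean_std([1.0, 3.0])
+        (2.0, 1.0)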
+ """ + if not vals: + return 0.0, 0.0 + m = sum(vals) / len(vals) + var = sum((v - m) ** 2 for v in vals) / len(vals) + return m, math.sqrt(var) + + +def make_back_to_back_bar( + stats: List[PairStats], + outpath: Optional[Path], + title: str = "Raw vs IPA Text Size (UTF-8 bytes)", +) -> None: + """ + Back-to-back horizontal bar chart: + - Raw text on the left (negative) + - IPA text on the right (positive) + + Adds: + - dotted mean lines for raw and ipa + - dotted ±1 stddev lines for raw and ipa + """ + langs = [s.lang for s in stats] + raw_bytes = [float(s.raw_bytes) for s in stats] + ipa_bytes = [float(s.ipa_bytes) for s in stats] + + raw_vals = [-b for b in raw_bytes] # negative for left side + ipa_vals = ipa_bytes # positive for right side + + raw_mean, raw_std = _mean_std(raw_bytes) + ipa_mean, ipa_std = _mean_std(ipa_bytes) + + y = range(len(langs)) + + plt.figure(figsize=(10, max(5, 0.5 * len(langs)))) + plt.barh(y, raw_vals, label="Raw text", alpha=0.7) + plt.barh(y, ipa_vals, label="IPA text", alpha=0.7) + + plt.yticks(y, langs) + plt.axvline(0, color="black", linewidth=1) + + # Mean lines (dotted) + plt.axvline(-raw_mean, linestyle=":", linewidth=2, label=f"Raw mean ({raw_mean:.0f})") + plt.axvline(ipa_mean, linestyle=":", linewidth=2, label=f"IPA mean ({ipa_mean:.0f})") + + # ±1 stddev lines (dotted, lighter) + plt.axvline(-(raw_mean - raw_std), linestyle=":", linewidth=1) + plt.axvline(-(raw_mean + raw_std), linestyle=":", linewidth=1) + plt.axvline(ipa_mean - ipa_std, linestyle=":", linewidth=1) + plt.axvline(ipa_mean + ipa_std, linestyle=":", linewidth=1) + + plt.xlabel("UTF-8 bytes") + plt.title( + f"{title}\n" + f"Raw mean={raw_mean:.0f}, std={raw_std:.0f} | " + f"IPA mean={ipa_mean:.0f}, std={ipa_std:.0f}" + ) + plt.grid(True, axis="x", linestyle="--", linewidth=0.5) + + max_val = max(max(ipa_vals), max(abs(v) for v in raw_vals)) + max_val = max(max_val, raw_mean + raw_std, ipa_mean + ipa_std) + plt.xlim(-max_val * 1.15, max_val * 1.15) + + plt.legend(loc="best") + + if outpath: + plt.tight_layout() + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + print(f"[back-to-back] Raw bytes: mean={raw_mean:.2f}, std={raw_std:.2f}") + print(f"[back-to-back] IPA bytes: mean={ipa_mean:.2f}, std={ipa_std:.2f}") + + +def make_grouped_raw_ipa_tok( + stats: List[PairStats], + outpath: Optional[Path], + tok_label: str, + title: str = "Raw vs IPA vs Tokenized Size (UTF-8 bytes)", +) -> None: + """ + Grouped (clustered) vertical bar chart per language: + raw, ipa, tok (if present) + + If some rows are missing tok_bytes, we simply omit that bar for that language. 
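+
+    Bars are offset around each language's x position: raw at x - width,
+    IPA at x, and the tokenized bar (when present) at x + width.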
+ """ + langs = [s.lang for s in stats] + raw = [s.raw_bytes for s in stats] + ipa = [s.ipa_bytes for s in stats] + tok = [s.tok_bytes for s in stats] # Optional[int] + + x = list(range(len(langs))) + width = 0.25 + + plt.figure(figsize=(max(10, 0.9 * len(langs)), 5)) + + # raw and ipa always present + plt.bar([i - width for i in x], raw, width=width, label="Raw") + plt.bar([i for i in x], ipa, width=width, label="IPA") + + # tokenized: only where present + tok_x = [] + tok_y = [] + for i, v in enumerate(tok): + if v is not None: + tok_x.append(i + width) + tok_y.append(v) + if tok_x: + plt.bar(tok_x, tok_y, width=width, label=tok_label) + + plt.xticks(x, langs, rotation=35, ha="right") + plt.ylabel("Bytes (UTF-8)") + plt.title(title) + plt.grid(True, axis="y", linestyle="--", linewidth=0.5) + plt.legend(loc="best") + + plt.tight_layout() + if outpath: + plt.savefig(outpath, dpi=200) + plt.close() + else: + plt.show() + + +def write_csv(stats: List[PairStats], out_csv: Path) -> None: + out_csv.parent.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow( + [ + "lang", + "raw_path", + "ipa_path", + "raw_bytes", + "ipa_bytes", + "ratio_bytes", + "delta_bytes", + "raw_chars", + "ipa_chars", + "raw_lines", + "ipa_lines", + "tok_bytes", + ] + ) + for s in stats: + w.writerow( + [ + s.lang, + str(s.raw_path), + str(s.ipa_path), + s.raw_bytes, + s.ipa_bytes, + f"{s.ratio_bytes:.6f}", + s.delta_bytes, + s.raw_chars, + s.ipa_chars, + s.raw_lines, + s.ipa_lines, + "" if s.tok_bytes is None else s.tok_bytes, + ] + ) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Graph IPA vs raw text sizes across folders.") + ap.add_argument("--text-dir", default="text", help="Directory with text_.txt files (default: text/)") + ap.add_argument("--ipa-dir", default="ipa", help="Directory with ipa_text_.txt files (default: ipa/)") + ap.add_argument("--save", action="store_true", help="Save plots instead of showing them.") + ap.add_argument("--outdir", default="plots_out", help="Output directory when --save is used.") + ap.add_argument("--csv", action="store_true", help="Also write CSV of statistics.") + ap.add_argument( + "--sort", + choices=["lang", "raw_bytes", "ipa_bytes", "ratio", "delta"], + default="lang", + help="Sort order for plots.", + ) + ap.add_argument( + "--title", + default="IPA vs Raw Text Size (UTF-8 bytes)", + help="Title for scatter plot.", + ) + + # NEW: load tokenized sizes (KB) from filtered json and compare in bytes + ap.add_argument( + "--filtered-json", + default=None, + help="Optional: filtered_scripts.json (annotated) containing tokenized_sizes (KB). " + "If set, we will add a Raw vs IPA vs Tokenized plot.", + ) + ap.add_argument( + "--tok-method", + default="tiktoken", + help="Which tokenized_sizes[method] to load from filtered json (default: tiktoken).", + ) + ap.add_argument( + "--skip-missing-tok", + action="store_true", + help="If set, drop languages that don't have tokenized_sizes[tok-method]. 
" + "Default: keep language but omit tok bar.", + ) + + args = ap.parse_args() + + text_dir = Path(args.text_dir) + ipa_dir = Path(args.ipa_dir) + + if not text_dir.exists(): + raise SystemExit(f"text-dir not found: {text_dir}") + if not ipa_dir.exists(): + raise SystemExit(f"ipa-dir not found: {ipa_dir}") + + pairs = discover_pairs(text_dir, ipa_dir) + if not pairs: + raise SystemExit("No matching text_/ipa_text_ pairs found.") + + tok_kb_map: Dict[str, float] = {} + if args.filtered_json: + tok_kb_map = _load_tokenized_kb_map(Path(args.filtered_json), method=args.tok_method) + + stats: List[PairStats] = [] + for lang, raw_p, ipa_p in pairs: + rb, rc, rl = read_stats(raw_p) + ib, ic, il = read_stats(ipa_p) + + tok_bytes: Optional[int] = None + if tok_kb_map: + kb = tok_kb_map.get(lang) + if kb is not None: + tok_bytes = int(round(kb * 1024.0)) + + # optionally drop missing tokenized values + if args.skip_missing_tok and tok_kb_map and tok_bytes is None: + continue + + stats.append( + PairStats( + lang=lang, + raw_path=raw_p, + ipa_path=ipa_p, + raw_bytes=rb, + ipa_bytes=ib, + raw_chars=rc, + ipa_chars=ic, + raw_lines=rl, + ipa_lines=il, + tok_bytes=tok_bytes, + ) + ) + + # Sorting + key_map = { + "lang": lambda s: s.lang, + "raw_bytes": lambda s: s.raw_bytes, + "ipa_bytes": lambda s: s.ipa_bytes, + "ratio": lambda s: s.ratio_bytes, + "delta": lambda s: s.delta_bytes, + } + stats.sort(key=key_map[args.sort]) + + outdir = Path(args.outdir) + if args.save: + outdir.mkdir(parents=True, exist_ok=True) + + make_scatter( + stats, + outdir / "scatter_ipa_vs_raw_bytes.png" if args.save else None, + args.title, + ) + + make_bar( + stats, + [s.ratio_bytes for s in stats], + "IPA / Raw (bytes)", + outdir / "bar_ratio_ipa_over_raw.png" if args.save else None, + "IPA expansion ratio by language", + ) + + make_bar( + stats, + [float(s.delta_bytes) for s in stats], + "IPA - Raw (bytes)", + outdir / "bar_delta_ipa_minus_raw.png" if args.save else None, + "Absolute size increase (IPA − Raw)", + ) + + make_back_to_back_bar( + stats, + outdir / "bar_back_to_back_raw_vs_ipa.png" if args.save else None, + title="Raw vs IPA Text Size by Language (UTF-8 bytes)", + ) + + # NEW: grouped raw vs ipa vs tokenized (if filtered_json provided and any matches exist) + if tok_kb_map: + any_tok = any(s.tok_bytes is not None for s in stats) + if any_tok: + make_grouped_raw_ipa_tok( + stats, + outdir / f"bar_grouped_raw_ipa_{args.tok_method}.png" if args.save else None, + tok_label=args.tok_method, + title=f"Raw vs IPA vs {args.tok_method} (bytes)", + ) + else: + print(f"[warn] --filtered-json provided but no tokenized_sizes['{args.tok_method}'] matched your lang keys.") + + if args.save and args.csv: + write_csv(stats, outdir / "ipa_vs_raw_stats.csv") + + for s in stats: + tok_str = "n/a" if s.tok_bytes is None else str(s.tok_bytes) + print( + f"{s.lang:14s} raw={s.raw_bytes:8d} " + f"ipa={s.ipa_bytes:8d} " + f"tok({args.tok_method})={tok_str:>8s} " + f"ratio={s.ratio_bytes:6.3f} " + f"delta={s.delta_bytes:8d}" + ) + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/plot_langscript_sizes_grouped.py b/data/flores200-res/plot_langscript_sizes_grouped.py new file mode 100644 index 0000000000..c1462018c1 --- /dev/null +++ b/data/flores200-res/plot_langscript_sizes_grouped.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +import json +import re +from collections import Counter, defaultdict +import matplotlib.pyplot as plt + +# text__ + + + +
+

{title}

+ Pick a token to update per-file counts; heatmap shows file similarity via high-frequency vocab. +
+ +
+
+

Vocab + total frequency (directory aggregate)

+
+ + + + +
+
+
+ Showing top {len(token_rows)} tokens by frequency. +
+
+ +
+

Per-file counts + similarity heatmap

+
+
+
+
+
+ Heatmap uses TF-IDF over high-frequency SentencePiece tokens and cosine similarity. +
+
+
+ + + + +""" + out_path.write_text(html, encoding="utf-8") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--vocab", default="trained_spm_model.vocab", + help="SentencePiece vocab file (default: trained_spm_model.vocab). Used to infer .model if --model not given.") + ap.add_argument("--model", default=None, + help="SentencePiece model file (.model). If omitted, inferred from --vocab by replacing .vocab -> .model.") + ap.add_argument("--dir", required=True, + help="Directory of text files to scan.") + ap.add_argument("--recursive", action="store_true", + help="Recurse into subdirectories (default: false).") + ap.add_argument("--suffixes", default=".txt", + help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + + ap.add_argument("--top-k", type=int, default=1500, + help="Embed only top-K tokens by total frequency into the HTML (default: 1500).") + ap.add_argument("--min-count", type=int, default=1, + help="Only consider tokens with total count >= this (default: 1).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + + # NEW: heatmap controls (no dendro mode; just heatmap) + ap.add_argument("--heatmap", action="store_true", + help="Compute and embed a file similarity heatmap (requires numpy).") + ap.add_argument("--heatmap-top-k", type=int, default=300, + help="Use top-K frequent tokens (from the directory) as features for TF-IDF similarity (default: 300).") + ap.add_argument("--heatmap-reorder", action="store_true", + help="Reorder files to group similar ones (simple greedy heuristic, no SciPy).") + + args = ap.parse_args() + + vocab_path = Path(args.vocab) + model_path = Path(args.model) if args.model else infer_model_path_from_vocab(vocab_path) + root = Path(args.dir) + out = Path(args.out) + + if not model_path.exists(): + raise SystemExit(f"SentencePiece model not found: {model_path} (pass --model or ensure it matches --vocab)") + if not root.exists() or not root.is_dir(): + raise SystemExit(f"Directory not found: {root}") + + suffixes = tuple(s.strip().lower() for s in args.suffixes.split(",") if s.strip()) + files = iter_text_files(root, recursive=args.recursive, suffixes=suffixes) + if not files: + raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") + + sp = spm.SentencePieceProcessor(model_file=str(model_path)) + + print(f"[info] model: {model_path}") + print(f"[info] vocab size: {sp.get_piece_size()}") + print(f"[info] scanning {len(files)} files under: {root}") + + total = Counter() + per_file: Dict[str, Counter] = {} + file_names: List[str] = [] + + for p in files: + rel = str(p.relative_to(root)) + file_names.append(rel) + c = count_tokens_in_file(sp, p) + per_file[rel] = c + total.update(c) + + # Token UI: top tokens by directory frequency + items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] + items.sort(key=lambda x: x[1], reverse=True) + if not items: + raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") + + top_items = items[: max(1, args.top_k)] + + token_rows: List[Dict] = [] + per_file_counts: Dict[str, Dict[str, int]] = {} + + for tid, cnt in top_items: + tok = human_token(sp.id_to_piece(int(tid))) + token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) + + for tid, _ in top_items: + tid = int(tid) + k = str(tid) + per_file_counts[k] = {} + for fn in file_names: + v = per_file[fn].get(tid, 0) + if v: + per_file_counts[k][fn] = 
int(v) + + default_token_id = int(top_items[0][0]) + + heatmap_payload: Optional[Dict] = None + if args.heatmap: + # Features for similarity + feat_tok_ids = [int(tid) for tid, _ in items[: max(2, args.heatmap_top_k)]] + print(f"[info] heatmap features: top {len(feat_tok_ids)} tokens (TF-IDF)") + try: + heatmap_payload = _build_heatmap_payload( + file_names=file_names, + per_file=per_file, + token_ids_for_heatmap=feat_tok_ids, + reorder=args.heatmap_reorder, + ) + except Exception as e: + heatmap_payload = {"ok": False, "reason": f"Failed to build heatmap: {e!r}"} + print(f"[warn] heatmap failed: {e!r}") + + title = f"SentencePiece token frequency dashboard ({root.name})" + build_html( + title=title, + token_rows=token_rows, + per_file_counts=per_file_counts, + file_order=file_names, + default_token_id=default_token_id, + heatmap_payload=heatmap_payload, + out_path=out, + ) + + print(f"[done] wrote: {out}") + print("[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + if args.heatmap and (np is None): + print("[note] Install heatmap deps: python3 -m pip install numpy") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/tokenization_vs_origina.sh b/data/flores200-res/tokenization_vs_origina.sh new file mode 100644 index 0000000000..03b7ab75ac --- /dev/null +++ b/data/flores200-res/tokenization_vs_origina.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode ratio \ + --method tiktoken \ + --group-by family --color-by family \ + --out ratio_family.png + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode tokenized_kb --method tiktoken \ + --group-by region --color-by region \ + --out tok_kb_region.png + +python3 plot_tokenization_vs_original.py \ + --json filtered_files.json \ + --mode ratio --method tiktoken \ + --group-by script --color-by script \ + --out ratio_script.png + diff --git a/data/flores200-res/tokenize.sh b/data/flores200-res/tokenize.sh new file mode 100644 index 0000000000..634bf10554 --- /dev/null +++ b/data/flores200-res/tokenize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +python3 tokenize_and_annotate_sizes.py \ + --in-json filtered_files.json \ + --method tiktoken \ + --tiktoken-encoding gpt2 + diff --git a/data/flores200-res/tokenize_and_annotate_sizes.py b/data/flores200-res/tokenize_and_annotate_sizes.py new file mode 100644 index 0000000000..e39d834899 --- /dev/null +++ b/data/flores200-res/tokenize_and_annotate_sizes.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +tokenize_and_annotate_sizes.py + +Reads the *filtered* JSON produced by filter_files_by_script.py (one entry per file), +runs prepare.py (assumed symlinked as ./prepare.py) to tokenize each text file with +100% train split (no val), writes: + + text__