From 45fbaec537b22f9ded2a50d3ec7a05ed80ff52ae Mon Sep 17 00:00:00 2001 From: klei22 Date: Tue, 23 Dec 2025 18:49:53 -0800 Subject: [PATCH 01/10] Update script to emit stats and expand targets Each script now has a common argument for stats_json, which emits the number of tokens not transcribed (those which will be held for byte tokenization). Added an espeak2ipa.py script which can target any of the espeak languages, and defaulting this to target shan for now. --- data/flores200-res/phoneticize.sh | 10 +- data/template/utils/en2ipa.py | 188 +++++++++++-- data/template/utils/espeak2ipa.py | 406 ++++++++++++++++++++++++++++ data/template/utils/ja2ipa.py | 172 ++++++------ data/template/utils/ko_en_to_ipa.py | 138 ++++++++-- data/template/utils/zh_to_ipa.py | 168 ++++++++---- 6 files changed, 886 insertions(+), 196 deletions(-) create mode 100644 data/template/utils/espeak2ipa.py diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 007c034696..b52f0c2789 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -1,7 +1,9 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper +python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json +python3 utils/espeak2ipa.py text_shn_Mymr.txt --mode text --output_file ipa_text_shan.txt --no-wrapper --stats_json shan_stats.json --lang shan + diff --git a/data/template/utils/en2ipa.py b/data/template/utils/en2ipa.py index c6b9c84973..14708f1cc7 100644 --- a/data/template/utils/en2ipa.py +++ b/data/template/utils/en2ipa.py @@ -1,11 +1,11 @@ +#!/usr/bin/env python3 # data/template/utils/en2ipa.py import subprocess -from konlpy.tag import Okt import argparse import re import json -from typing import List, Tuple +from typing import List, Tuple, Optional, Dict, Any from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn from concurrent.futures import ThreadPoolExecutor, as_completed import os @@ -14,6 +14,21 @@ counter = 0 counter_lock = threading.Lock() +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + +_WORD_RE = re.compile(r'\w+|[^\w\s]', re.UNICODE) + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_english_token(tok: str) -> bool: + # Matches your original intent: “contains any a-z letter” + return any('a' <= ch.lower() <= 'z' for ch in tok) + + def transcribe_english(sentence, wrapper=False): """Transcribe an English sentence into its phonemes using espeak.""" try: @@ -24,28 +39,28 @@ def transcribe_english(sentence, wrapper=False): ) transcription = result.stdout.strip().replace("ㆍ", " ") if "(en)" in transcription: - return f"[[[[[{sentence}]]]]]" if wrapper else sentence + return 
f"{WRAP_PREFIX}{sentence}{WRAP_SUFFIX}" if wrapper else sentence return transcription except Exception as e: return f"Error in transcribing English: {str(e)}" + def handle_mixed_language(word, wrapper=False): """Handle a word with potential English, Language, or number content.""" global counter if word.isdigit(): return word - elif any('a' <= char.lower() <= 'z' for char in word): + elif is_english_token(word): return transcribe_english(word, wrapper=wrapper) else: if wrapper: - return "[[[[[" + word + "]]]]]" + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" else: - # thread-safe increment + # thread-safe increment (your existing stat) with counter_lock: counter += 1 return word -_WORD_RE = re.compile(r'\w+|[^\w\s]', re.UNICODE) def transcribe_tokens_to_string(tokens: List[str], wrapper: bool) -> str: result = [] @@ -56,11 +71,33 @@ def transcribe_tokens_to_string(tokens: List[str], wrapper: bool) -> str: result.append(tok) return " ".join(result) -def _worker_sentence(sentence: str, wrapper: bool) -> str: - """Worker function: tokenize and transcribe one sentence/line.""" + +def _worker_sentence(sentence: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: + """ + Worker function: tokenize and transcribe one sentence/line. + If stats is provided, updates: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (English tokens) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (digits, punctuation, non-English words) + Counts are based on ORIGINAL tokens, so wrapper overhead is excluded automatically. + """ tokens = _WORD_RE.findall(sentence) + + if stats is not None: + for tok in tokens: + b = utf8_len(tok) + if re.match(r'\w+', tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif is_english_token(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + return transcribe_tokens_to_string(tokens, wrapper=wrapper) + def _progress() -> Progress: return Progress( TextColumn("[bold blue]{task.description}"), @@ -72,8 +109,16 @@ def _progress() -> Progress: transient=False, ) -def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False, - multithread: bool = False, workers: int = 0): + +def transcribe_multilingual( + sentences, + input_json_key=None, + output_json_key='ipa', + wrapper=False, + multithread: bool = False, + workers: int = 0, + stats: Optional[Dict[str, int]] = None, +): """Transcribe multilingual sentences (JSON list mode).""" try: data = json.loads(sentences) if isinstance(sentences, str) else sentences @@ -84,6 +129,12 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa if n == 0: return json.dumps(data, ensure_ascii=False, indent=4) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + if not multithread or workers <= 1: # Single-threaded path (original behavior) with _progress() as progress: @@ -91,24 +142,35 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa for item in data: if input_json_key in item: sentence = item[input_json_key] - item[output_json_key] = _worker_sentence(sentence, wrapper) + item[output_json_key] = _worker_sentence(sentence, wrapper, stats=stats) progress.update(task, advance=1) else: # Multithreaded path with ordered assembly results: List[Tuple[int, str]] = [None] * n # type: 
ignore + # prepare jobs jobs = [] for idx, item in enumerate(data): sentence = item.get(input_json_key, "") jobs.append((idx, sentence)) + # Per-thread stats to avoid locks in hot path; merge at end + per_thread_stats: List[Dict[str, int]] = [] + + def submit_job(ex, idx_sentence): + idx, sentence = idx_sentence + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_thread_stats.append(local_stats) + return ex.submit(_worker_sentence, sentence, wrapper, local_stats), idx + with _progress() as progress: task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = { - ex.submit(_worker_sentence, sentence, wrapper): idx - for idx, sentence in jobs - } + future_to_idx = {} + for idx_sentence in jobs: + fut, idx = submit_job(ex, idx_sentence) + future_to_idx[fut] = idx + for fut in as_completed(future_to_idx): idx = future_to_idx[fut] try: @@ -118,6 +180,11 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa results[idx] = (idx, res) progress.update(task, advance=1) + # merge per-thread stats + for st in per_thread_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + # write back in original order for idx, item in enumerate(data): if input_json_key in item: @@ -129,32 +196,50 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa return json.dumps(data, ensure_ascii=False, indent=4) -def transcribe_text_lines(lines: List[str], wrapper: bool, multithread: bool = False, workers: int = 0) -> List[str]: + +def transcribe_text_lines( + lines: List[str], + wrapper: bool, + multithread: bool = False, + workers: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> List[str]: """Transcribe a plain-text file line-by-line.""" n = len(lines) if n == 0: return [] + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + if not multithread or workers <= 1: - # Single-threaded out_lines: List[str] = [] with _progress() as progress: task = progress.add_task("Processing text lines", total=n) for line in lines: raw = line.rstrip("\n") - out_lines.append(_worker_sentence(raw, wrapper)) + out_lines.append(_worker_sentence(raw, wrapper, stats=stats)) progress.update(task, advance=1) return out_lines else: - # Multithreaded with ordered assembly out_lines: List[str] = [None] * n # type: ignore + + # Per-thread stats (avoid global lock) + per_thread_stats: List[Dict[str, int]] = [None] * n # type: ignore + with _progress() as progress: task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = { - ex.submit(_worker_sentence, lines[i].rstrip("\n"), wrapper): i - for i in range(n) - } + future_to_idx = {} + for i in range(n): + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_thread_stats[i] = local_stats + fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), wrapper, local_stats) + future_to_idx[fut] = i + for fut in as_completed(future_to_idx): idx = future_to_idx[fut] try: @@ -162,8 +247,45 @@ def transcribe_text_lines(lines: List[str], wrapper: bool, multithread: bool = F except Exception as e: out_lines[idx] = f"Error: {e}" progress.update(task, advance=1) + + # merge stats + for st in per_thread_stats: + 
stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + return out_lines + +def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats: Dict[str, Any] = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + def main(): parser = argparse.ArgumentParser( description='Transcribe multilingual content into IPA phonemes. Supports JSON list mode and plain-text line mode.' @@ -192,12 +314,18 @@ def main(): parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, help="Number of worker threads when --multithread is enabled (default: CPU count).") + # NEW: stats output + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + args = parser.parse_args() # clamp workers if args.workers is None or args.workers < 1: args.workers = 1 + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + try: if args.mode == 'json': if not args.input_json_key: @@ -210,7 +338,8 @@ def main(): args.output_json_key, wrapper=args.wrapper, multithread=args.multithread, - workers=args.workers + workers=args.workers, + stats=stats, ) if updated_json_data: with open(args.input_file, 'w', encoding='utf-8') as f: @@ -223,19 +352,24 @@ def main(): lines, wrapper=args.wrapper, multithread=args.multithread, - workers=args.workers + workers=args.workers, + stats=stats, ) target_path = args.output_file if args.output_file else args.input_file with open(target_path, 'w', encoding='utf-8') as f: f.write("\n".join(out_lines) + ("\n" if out_lines else "")) print(f"✅ Successfully wrote transcribed text to '{target_path}'") - print(f"📊 Stats: {counter} unparseable words") + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + + print(f"📊 Stats: {counter} unparseable words (only counted when --no-wrapper)") + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except ValueError as ve: print(f"Error: {ve}") + if __name__ == '__main__': main() diff --git a/data/template/utils/espeak2ipa.py b/data/template/utils/espeak2ipa.py new file mode 100644 index 0000000000..9c2864afc0 --- /dev/null +++ b/data/template/utils/espeak2ipa.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +# espeak2ipa.py +# +# Generic IPA transcription using espeak-ng for ANY supported voice. +# Defaults to "shan" (you can override with --lang). 
+# +# Features (modeled after your en2ipa.py): +# - JSON list mode (--mode json): in-place update of a JSON list file +# - Text mode (--mode text): line-by-line transcription to output file (or overwrite input) +# - Optional wrapping for untranscribed/unparseable tokens: [[[[[...]]]]] +# - Multithreading with ordered output +# - Rich progress bar +# - Byte coverage stats (based on ORIGINAL tokens; wrapper overhead excluded) +# +# Notes: +# - "transcribed_bytes" counts bytes of ORIGINAL tokens we ATTEMPT to send to espeak +# (tokens that contain at least one Unicode letter). Digits/punct count as not_transcribed. +# - espeak-ng voices vary; if a voice is unavailable, you'll get an error/empty output. + +import subprocess +import argparse +import re +import json +from typing import List, Tuple, Optional, Dict, Any +from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn +from concurrent.futures import ThreadPoolExecutor, as_completed +import os +import threading + + +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" +_WORD_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE) + +counter_unparseable = 0 +counter_lock = threading.Lock() + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def token_has_letter(tok: str) -> bool: + # "letter" across scripts (Latin, Han, Kana, Arabic, etc.) + return any(ch.isalpha() for ch in tok) + + +def transcribe_espeak(token: str, lang: str, wrapper: bool = False) -> str: + """ + Transcribe a token via espeak-ng. + If transcription fails (empty output / exception), return wrapped or original token. + """ + global counter_unparseable + try: + result = subprocess.run( + ["espeak-ng", "-q", "-v", lang, "--ipa", token], + capture_output=True, + text=True + ) + out = (result.stdout or "").strip().replace("ㆍ", " ") + if not out: + if wrapper: + return f"{WRAP_PREFIX}{token}{WRAP_SUFFIX}" + with counter_lock: + counter_unparseable += 1 + return token + return out + except Exception: + if wrapper: + return f"{WRAP_PREFIX}{token}{WRAP_SUFFIX}" + with counter_lock: + counter_unparseable += 1 + return token + + +def handle_token(tok: str, lang: str, wrapper: bool) -> str: + """ + Decide whether to transcribe: + - digits -> passthrough + - tokens with any letter -> transcribe via espeak + - otherwise (punct/symbol) -> passthrough + """ + if tok.isdigit(): + return tok + if token_has_letter(tok): + return transcribe_espeak(tok, lang=lang, wrapper=wrapper) + return tok + + +def tokens_to_ipa_string(tokens: List[str], lang: str, wrapper: bool) -> str: + out: List[str] = [] + for tok in tokens: + if re.match(r"\w+", tok): + out.append(handle_token(tok, lang=lang, wrapper=wrapper)) + else: + out.append(tok) + return " ".join(out) + + +def _worker_sentence(sentence: str, lang: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: + """ + Tokenize & transcribe one sentence/line. 
+ If stats is provided, updates byte counts based on ORIGINAL tokens: + - transcribed_bytes: tokens containing at least one letter + - not_transcribed_bytes: digits + punctuation/symbols + other \w tokens with no letters + """ + tokens = _WORD_RE.findall(sentence) + + if stats is not None: + for tok in tokens: + b = utf8_len(tok) + if re.match(r"\w+", tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif token_has_letter(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + + return tokens_to_ipa_string(tokens, lang=lang, wrapper=wrapper) + + +def _progress() -> Progress: + return Progress( + TextColumn("[bold blue]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + TimeRemainingColumn(), + transient=False, + ) + + +def transcribe_json_list( + json_text_or_obj, + input_json_key: str, + output_json_key: str, + lang: str, + wrapper: bool, + multithread: bool, + workers: int, + stats: Optional[Dict[str, int]] = None, +) -> Optional[str]: + """ + JSON list mode: reads a JSON list of objects, writes output_json_key for each object. + Returns JSON string (pretty printed). + """ + try: + data = json.loads(json_text_or_obj) if isinstance(json_text_or_obj, str) else json_text_or_obj + if not isinstance(data, list): + raise ValueError("JSON data should be a list of objects.") + except Exception as e: + print(f"Error: {e}") + return None + + n = len(data) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + if n == 0: + return json.dumps(data, ensure_ascii=False, indent=4) + + if not multithread or workers <= 1: + with _progress() as progress: + task = progress.add_task("Processing JSON items", total=n) + for item in data: + if input_json_key in item: + sentence = item[input_json_key] + item[output_json_key] = _worker_sentence(sentence, lang=lang, wrapper=wrapper, stats=stats) + progress.update(task, advance=1) + else: + # ordered results + results: List[Tuple[int, str]] = [None] * n # type: ignore + per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore + + jobs = [(i, data[i].get(input_json_key, "")) for i in range(n)] + + with _progress() as progress: + task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) + with ThreadPoolExecutor(max_workers=workers) as ex: + future_to_idx = {} + for idx, sentence in jobs: + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_item_stats[idx] = local_stats + fut = ex.submit(_worker_sentence, sentence, lang, wrapper, local_stats) + future_to_idx[fut] = idx + + for fut in as_completed(future_to_idx): + idx = future_to_idx[fut] + try: + res = fut.result() + except Exception as e: + res = f"Error: {e}" + results[idx] = (idx, res) + progress.update(task, advance=1) + + # merge stats + for st in per_item_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + + # write back in original order + for idx, item in enumerate(data): + if input_json_key in item: + item[output_json_key] = results[idx][1] + + return json.dumps(data, ensure_ascii=False, indent=4) + + +def transcribe_text_lines( + lines: List[str], + lang: str, + wrapper: bool, + multithread: bool, + workers: int, + stats: Optional[Dict[str, int]] = None, 
+) -> List[str]: + """ + Text mode: input is one sentence per line. Output is one IPA line per input line. + """ + n = len(lines) + if stats is None: + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + else: + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + if n == 0: + return [] + + if not multithread or workers <= 1: + out_lines: List[str] = [] + with _progress() as progress: + task = progress.add_task("Processing text lines", total=n) + for line in lines: + raw = line.rstrip("\n") + out_lines.append(_worker_sentence(raw, lang=lang, wrapper=wrapper, stats=stats)) + progress.update(task, advance=1) + return out_lines + + out_lines: List[str] = [None] * n # type: ignore + per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore + + with _progress() as progress: + task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) + with ThreadPoolExecutor(max_workers=workers) as ex: + future_to_idx = {} + for i in range(n): + local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + per_item_stats[i] = local_stats + fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), lang, wrapper, local_stats) + future_to_idx[fut] = i + + for fut in as_completed(future_to_idx): + i = future_to_idx[fut] + try: + out_lines[i] = fut.result() + except Exception as e: + out_lines[i] = f"Error: {e}" + progress.update(task, advance=1) + + for st in per_item_stats: + stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) + stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) + + return out_lines + + +def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats: Dict[str, Any] = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + "unparseable_tokens": counter_unparseable, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + print(f"Unparseable tokens : {out_stats['unparseable_tokens']}") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + +def main(): + parser = argparse.ArgumentParser( + description="Generic IPA transcription using espeak-ng for any supported voice (default: shan). " + "Supports JSON list mode and plain-text line mode, with byte coverage stats." + ) + parser.add_argument("input_file", type=str, help="Path to the input file (JSON list or plain text).") + + # Language / voice + parser.add_argument("--lang", default="shan", + help="espeak-ng voice/language code (default: shan). Example: en, fr, de, es, ja, zh, etc.") + + # Mode selection + parser.add_argument("--mode", choices=["json", "text"], default="json", + help='Processing mode. 
"json" expects a JSON list; "text" treats file as plain text.') + + # JSON mode params + parser.add_argument("--input_json_key", type=str, + help="JSON key to read sentences from (required for --mode json).") + parser.add_argument("--output_json_key", type=str, default="ipa", + help='JSON key to store IPA (default: "ipa").') + + # Text mode params + parser.add_argument("--output_file", type=str, default=None, + help="Output file path for text mode. Defaults to overwriting input.") + + # Wrapper option + parser.add_argument("--wrapper", default=False, action=argparse.BooleanOptionalAction, + help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).") + + # Multithreading options + parser.add_argument("--multithread", default=False, action=argparse.BooleanOptionalAction, + help="Enable multithreading while preserving output order.") + parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, + help="Number of worker threads when --multithread is enabled (default: CPU count).") + + # Stats output + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + + args = parser.parse_args() + + # clamp workers + if args.workers is None or args.workers < 1: + args.workers = 1 + + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + + try: + if args.mode == "json": + if not args.input_json_key: + raise ValueError("--input_json_key is required when --mode json") + + with open(args.input_file, "r", encoding="utf-8") as f: + input_content = f.read() + + updated_json = transcribe_json_list( + input_content, + input_json_key=args.input_json_key, + output_json_key=args.output_json_key, + lang=args.lang, + wrapper=args.wrapper, + multithread=args.multithread, + workers=args.workers, + stats=stats, + ) + if updated_json is not None: + # matches your existing style: overwrite JSON input file + with open(args.input_file, "w", encoding="utf-8") as f: + f.write(updated_json) + print(f"✅ Successfully updated JSON data in '{args.input_file}'") + + else: + with open(args.input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + + out_lines = transcribe_text_lines( + lines, + lang=args.lang, + wrapper=args.wrapper, + multithread=args.multithread, + workers=args.workers, + stats=stats, + ) + + target_path = args.output_file if args.output_file else args.input_file + with open(target_path, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + print(f"✅ Successfully wrote transcribed text to '{target_path}'") + + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + + except FileNotFoundError: + print(f"Error: Input file '{args.input_file}' not found.") + except ValueError as ve: + print(f"Error: {ve}") + + +if __name__ == "__main__": + main() + diff --git a/data/template/utils/ja2ipa.py b/data/template/utils/ja2ipa.py index 513868c490..fb9ea522fd 100644 --- a/data/template/utils/ja2ipa.py +++ b/data/template/utils/ja2ipa.py @@ -367,14 +367,6 @@ def hiragana_to_ipa(text: str) -> str: # ========== 2) MeCab Morphological Tokenization ========== def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """ - Use MeCab for morphological analysis. Return four strings: - 1) spaced_original: original surface forms joined by spaces. - 2) spaced_hira_subbed: token text with the "は" particle overridden to "わ" where applicable, then converted to Hiragana. 
- 3) spaced_hira_original: the Hiragana conversion of the original spaced text. - 4) pos_tags: part-of-speech tags for each token (joined by spaces). - If MeCab is not available, return (None, None, None, None). - """ if not MECAB_AVAILABLE: return None, None, None, None @@ -389,10 +381,9 @@ def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio surface = node.surface features = node.feature.split(",") if len(features) >= 1: - pos = features[0] # e.g. 助詞, 名詞, 動詞... + pos = features[0] tokens_original.append(surface) pos_tokens.append(pos) - # Override if particle "は" (助詞) if pos == "助詞" and surface == "は": tokens_for_hira.append("わ") else: @@ -415,21 +406,12 @@ def mecab_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio # ========== 3) spaCy Morphological Tokenization ========== _spacy_nlp = None def load_spacy_japanese(): - """Lazy-load the spaCy model. Requires 'ja_core_news_sm' or similar to be installed.""" global _spacy_nlp if _spacy_nlp is None: _spacy_nlp = spacy.load("ja_core_news_sm") return _spacy_nlp def spacy_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """ - Use spaCy morphological analysis. Return four strings: - 1) spaced_original: original token texts joined by spaces. - 2) spaced_hira_subbed: token texts (with "は" overridden to "わ" when pos_ is ADP) converted to Hiragana. - 3) spaced_hira_original: Hiragana conversion of the original spaced token texts. - 4) pos_tags: part-of-speech tags (using token.pos_) joined by spaces. - If spaCy is not available, return (None, None, None, None). - """ if not SPACY_AVAILABLE: return None, None, None, None @@ -459,7 +441,6 @@ def spacy_spaced_reading(text: str) -> Tuple[Optional[str], Optional[str], Optio # ========== 4) Unified "get spaced reading" function ========== def get_spaced_reading(text: str, method: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - """Return (spaced_original, spaced_hira_subbed, spaced_hira_original, pos_tags) using the chosen method.""" if method == "mecab": return mecab_spaced_reading(text) elif method == "spacy": @@ -480,10 +461,6 @@ def write_text_output( include_sentence: bool = True, sep: str = "\t" ) -> None: - """ - Write a plain-text file, one line per entry. - Default format: "\\t" - """ with open(output_file, "w", encoding="utf-8") as fout: for obj in out_array: sent = obj.get("sentence", "") @@ -494,6 +471,10 @@ def write_text_output( fout.write(f"{val}\n") +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + # ========== 6) Main Processing Logic ========== def process_japanese_text( input_file: str, @@ -505,19 +486,16 @@ def process_japanese_text( text_field: str = "spaced_ipa", text_include_sentence: bool = True, text_sep: str = "\t", + stats_json: Optional[str] = None, ): """ - Processes Japanese text to IPA. + Same behavior as before, plus byte coverage stats: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that are considered "transcribed" + (anything containing Japanese script: Hiragana/Katakana/Kanji) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (Latin, digits, punctuation) - INPUT MODES (same defaults as original): - - If json_inplace_update=True: treat input as JSON array with "sentence" fields. - - Else: treat input as plain text (one sentence per line). 
- - OUTPUT MODES (new): - - Default (unchanged): write JSON array to output_file - - If output_text=True: write plain text to output_file using selected field(s) + Counts are based on ORIGINAL tokens (wrapper overhead doesn't exist in this script). """ - # Decide morphological method: if use_mecab and use_spacy: print("Error: Please choose either MeCab or spaCy, not both.") sys.exit(1) @@ -528,10 +506,33 @@ def process_japanese_text( else: morph_method = None + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} out_array: List[Dict[str, Any]] = [] + def is_japanese_char(ch: str) -> bool: + o = ord(ch) + # Hiragana, Katakana, CJK Unified Ideographs (basic), plus common punctuation blocks are excluded intentionally. + return (0x3040 <= o <= 0x309F) or (0x30A0 <= o <= 0x30FF) or (0x4E00 <= o <= 0x9FFF) + + def is_japanese_token(tok: str) -> bool: + return any(is_japanese_char(ch) for ch in tok) + + def count_sentence_bytes(sentence: str) -> None: + # Tokenize similarly to your KR/ZH scripts for consistent accounting + toks = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE) + for tok in toks: + b = utf8_len(tok) + if re.match(r"\w+", tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += b + elif is_japanese_token(tok): + stats["transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + else: + stats["not_transcribed_bytes"] += b + if json_inplace_update: - # JSON input: process as JSON array. try: with open(input_file, "r", encoding="utf-8") as fin: data = json.load(fin) @@ -541,6 +542,8 @@ def process_japanese_text( continue original_text = entry["sentence"] + count_sentence_bytes(original_text) + hira_unspaced = to_hiragana(original_text) ipa_unspaced = hiragana_to_ipa(hira_unspaced) @@ -558,11 +561,7 @@ def process_japanese_text( out_obj["spaced_original"] = spaced_original if spaced_original is not None else "" out_obj["spaced_hira_subbed"] = spaced_hira_subbed if spaced_hira_subbed is not None else "" out_obj["pos_tags"] = pos_tags if pos_tags is not None else "" - - ipa_spaced = "" - if out_obj["spaced_hira_subbed"]: - ipa_spaced = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) - out_obj["spaced_ipa"] = ipa_spaced + out_obj["spaced_ipa"] = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) if out_obj["spaced_hira_subbed"] else "" out_array.append(out_obj) @@ -577,7 +576,6 @@ def process_japanese_text( return else: - # Plain text input: each non-blank line is treated as a sentence. 
try: with open(input_file, "r", encoding="utf-8") as fin: lines = fin.readlines() @@ -588,6 +586,8 @@ def process_japanese_text( continue original_text = line + count_sentence_bytes(original_text) + hira_unspaced = to_hiragana(original_text) ipa_unspaced = hiragana_to_ipa(hira_unspaced) @@ -605,11 +605,7 @@ def process_japanese_text( out_obj["spaced_original"] = spaced_original if spaced_original is not None else "" out_obj["spaced_hira_subbed"] = spaced_hira_subbed if spaced_hira_subbed is not None else "" out_obj["pos_tags"] = pos_tags if pos_tags is not None else "" - - ipa_spaced = "" - if out_obj["spaced_hira_subbed"]: - ipa_spaced = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) - out_obj["spaced_ipa"] = ipa_spaced + out_obj["spaced_ipa"] = hiragana_to_ipa(out_obj["spaced_hira_subbed"]) if out_obj["spaced_hira_subbed"] else "" out_array.append(out_obj) @@ -620,7 +616,7 @@ def process_japanese_text( print(f"An error occurred: {e}") return - # OUTPUT (default unchanged: JSON) + # OUTPUT (unchanged) if output_text: write_text_output( output_file=output_file, @@ -632,14 +628,44 @@ def process_japanese_text( else: write_json_array(output_file=output_file, out_array=out_array) + # Print + optional write stats + transcribed = int(stats["transcribed_bytes"]) + not_tx = int(stats["not_transcribed_bytes"]) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json: + with open(stats_json, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json}") + # ========== 7) Command-Line Entry Point ========== if __name__ == "__main__": + import re # local import to avoid changing your top imports too much + parser = argparse.ArgumentParser( description=( "Convert JP text to IPA with optional morphological spacing and POS tagging.\n" "DEFAULT behavior matches original: input may be JSON (-j) or plain text, output is JSON array.\n" - "NEW: you can output plain text with --text_output." + "You can output plain text with --text_output.\n" + "NEW: prints byte coverage stats and can write them with --stats_json." ) ) parser.add_argument( @@ -658,42 +684,27 @@ def process_japanese_text( parser.add_argument( "-j", "--json_inplace_update", action="store_true", + help="Treat input file as JSON array and update each entry." ) group = parser.add_mutually_exclusive_group() - group.add_argument( - "--use_mecab", - action="store_true", - help="Use MeCab for morphological tokenization (and forcing 'は' => 'わ')." - ) - group.add_argument( - "--use_spacy", - action="store_true", - help="Use spaCy for morphological tokenization (and forcing 'は' => 'わ')." - ) - - # NEW OUTPUT MODE - parser.add_argument( - "--text_output", - action="store_true", - help="Write a plain-text output file (one line per sentence) instead of JSON." 
- ) - parser.add_argument( - "--text_field", - default="spaced_ipa", - help="Which field to emit in --text_output mode (default: spaced_ipa). " - "Common choices: unspaced_ipa, spaced_ipa, spaced_hira_subbed, pos_tags." - ) - parser.add_argument( - "--text_no_sentence", - action="store_true", - help="In --text_output mode, emit only the selected field (omit the original sentence)." - ) - parser.add_argument( - "--text_sep", - default="\t", - help="Separator used between sentence and field in --text_output mode (default: tab)." - ) + group.add_argument("--use_mecab", action="store_true", + help="Use MeCab for morphological tokenization (and forcing 'は' => 'わ').") + group.add_argument("--use_spacy", action="store_true", + help="Use spaCy for morphological tokenization (and forcing 'は' => 'わ').") + + parser.add_argument("--text_output", action="store_true", + help="Write a plain-text output file (one line per sentence) instead of JSON.") + parser.add_argument("--text_field", default="spaced_ipa", + help="Which field to emit in --text_output mode (default: spaced_ipa). " + "Common: unspaced_ipa, spaced_ipa, spaced_hira_subbed, pos_tags.") + parser.add_argument("--text_no_sentence", action="store_true", + help="In --text_output mode, emit only the selected field (omit the original sentence).") + parser.add_argument("--text_sep", default="\t", + help="Separator used between sentence and field in --text_output mode (default: tab).") + + parser.add_argument("--stats_json", type=str, default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") args = parser.parse_args() @@ -707,5 +718,6 @@ def process_japanese_text( text_field=args.text_field, text_include_sentence=(not args.text_no_sentence), text_sep=args.text_sep, + stats_json=args.stats_json, ) diff --git a/data/template/utils/ko_en_to_ipa.py b/data/template/utils/ko_en_to_ipa.py index 28e90963a7..4392b63a06 100644 --- a/data/template/utils/ko_en_to_ipa.py +++ b/data/template/utils/ko_en_to_ipa.py @@ -5,6 +5,17 @@ import re import json +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_korean_token(token: str) -> bool: + return any('가' <= ch <= '힣' for ch in token) + def transcribe_korean(sentence, wrapper=False): """Transcribe a Korean sentence into its phonemes using KoNLPy (Okt) + espeak-ng.""" @@ -25,7 +36,7 @@ def transcribe_korean(sentence, wrapper=False): # Check for failed transcription markers if "(en)" in transcription or "(ko)" in transcription: if wrapper: - return "[[[[[" + sentence + "]]]]]" + return f"{WRAP_PREFIX}{sentence}{WRAP_SUFFIX}" return sentence return transcription @@ -37,40 +48,69 @@ def transcribe_korean(sentence, wrapper=False): def handle_mixed_language(word, wrapper=False): """Handle a word with potential Korean, other language, or number content.""" - if word.isdigit(): # Detect numbers (pass through unchanged) + if word.isdigit(): # numbers pass through unchanged return word - elif any('가' <= char <= '힣' for char in word): # Detect Korean + elif is_korean_token(word): return transcribe_korean(word, wrapper=wrapper) - else: # Non-Korean word + else: # Non-Korean if wrapper: - return "[[[[[" + word + "]]]]]" + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" return word -def transcribe_plain_text(text, wrapper=False): - """Transcribe a plain text string into IPA, leaving non-Korean as-is (or wrapped).""" - result = [] +def transcribe_plain_text( + text, + wrapper=False, + stats=None, +): + """ + 
Transcribe a plain text string into IPA, leaving non-Korean as-is (or wrapped). + + If stats dict is provided, it will be updated with: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (Korean tokens only) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens not transcribed (includes Latin, digits, punctuation) + Counts are based on ORIGINAL tokens, so wrapper overhead is excluded automatically. + """ + if stats is None: + stats = {} + + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + + out = [] words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE) - for word in words: - if re.match(r'\w+', word): - result.append(handle_mixed_language(word, wrapper=wrapper)) + for tok in words: + tok_bytes = utf8_len(tok) + + if re.match(r'\w+', tok): + if tok.isdigit(): + stats["not_transcribed_bytes"] += tok_bytes + elif is_korean_token(tok): + stats["transcribed_bytes"] += tok_bytes + else: + stats["not_transcribed_bytes"] += tok_bytes + + out.append(handle_mixed_language(tok, wrapper=wrapper)) else: - result.append(word) - return " ".join(result) + # punctuation/symbols + stats["not_transcribed_bytes"] += tok_bytes + out.append(tok) + + return " ".join(out) -def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False): +def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa', wrapper=False, stats=None): """ Transcribe multilingual sentences and update JSON data directly. - Args: - sentences: JSON string or a loaded JSON object. - input_json_key: Key to extract sentences from in a JSON. - output_json_key: Key to store IPA transcription in the JSON (default: 'ipa'). - - Returns: - The modified JSON string with IPA transcriptions added. + Returns the modified JSON string with IPA transcriptions added. + If stats dict is provided, it will be updated with byte coverage counts. 
""" + if stats is None: + stats = {} + stats.setdefault("transcribed_bytes", 0) + stats.setdefault("not_transcribed_bytes", 0) + try: data = json.loads(sentences) if isinstance(sentences, str) else sentences if not isinstance(data, list): @@ -79,8 +119,8 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa for item in data: if input_json_key in item: sentence = item[input_json_key] - transcription_result = transcribe_plain_text(sentence, wrapper=wrapper) - item[output_json_key] = transcription_result # Update directly + transcription_result = transcribe_plain_text(sentence, wrapper=wrapper, stats=stats) + item[output_json_key] = transcription_result print(transcription_result) else: print(f"Warning: Key '{input_json_key}' not found in item: {item}") @@ -92,9 +132,39 @@ def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa return json.dumps(data, ensure_ascii=False, indent=4) +def finalize_and_print_stats(stats, stats_json_path=None): + transcribed = int(stats.get("transcribed_bytes", 0)) + not_tx = int(stats.get("not_transcribed_bytes", 0)) + total = transcribed + not_tx + pct_tx = (transcribed / total * 100.0) if total else 0.0 + pct_not = (not_tx / total * 100.0) if total else 0.0 + + out_stats = { + "transcribed_bytes": transcribed, + "not_transcribed_bytes": not_tx, + "total_bytes": total, + "pct_transcribed": pct_tx, + "pct_not_transcribed": pct_not, + } + + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {out_stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {out_stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {out_stats['total_bytes']}") + print(f"% transcribed : {out_stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {out_stats['pct_not_transcribed']:.2f}%") + + if stats_json_path: + with open(stats_json_path, "w", encoding="utf-8") as sf: + json.dump(out_stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {stats_json_path}") + + return out_stats + + def main(): parser = argparse.ArgumentParser( - description='Transcribe multilingual text or JSON into IPA phonemes (Korean via espeak-ng).' + description='Transcribe multilingual text or JSON into IPA phonemes (Korean via espeak-ng), with byte coverage stats.' ) parser.add_argument( @@ -132,11 +202,20 @@ def main(): "--wrapper", default=False, action=argparse.BooleanOptionalAction, - help="Wrap unparseable text with [[[[[square brackets]]]]], for later recovery." + help="Wrap unparseable/non-target tokens with [[[[[...]]]]]. Use --no-wrapper to leave them unchanged." + ) + + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing)." 
) args = parser.parse_args() + stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} + try: with open(args.input_file, 'r', encoding='utf-8') as f: input_content = f.read() @@ -145,7 +224,8 @@ def main(): if args.text_input: transcription = transcribe_plain_text( input_content, - wrapper=args.wrapper + wrapper=args.wrapper, + stats=stats ) if args.text_output: @@ -164,11 +244,11 @@ def main(): input_content, args.input_json_key, args.output_json_key, - wrapper=args.wrapper + wrapper=args.wrapper, + stats=stats ) if updated_json_data: - # Default behavior: overwrite original JSON if args.text_output: with open(args.text_output, 'w', encoding='utf-8') as f: f.write(updated_json_data) @@ -178,6 +258,8 @@ def main(): f.write(updated_json_data) print(f"Successfully updated JSON data in '{args.input_file}'") + finalize_and_print_stats(stats, stats_json_path=args.stats_json) + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except ValueError as e: diff --git a/data/template/utils/zh_to_ipa.py b/data/template/utils/zh_to_ipa.py index 387bca3303..987e112487 100644 --- a/data/template/utils/zh_to_ipa.py +++ b/data/template/utils/zh_to_ipa.py @@ -9,6 +9,20 @@ import json +WRAP_PREFIX = "[[[[[" +WRAP_SUFFIX = "]]]]]" + + +def utf8_len(s: str) -> int: + return len(s.encode("utf-8")) + + +def is_chinese_token(token: str) -> bool: + # Keeps your original behavior: "Chinese" means "contains any simplified Hanzi" + # (This will miss pure-traditional-only text; expand if you want.) + return any(hanzi.is_simplified(ch) for ch in token) + + def transcribe_chinese(sentence: str) -> str: """Transcribe a Chinese sentence into its phonemes using dragonmapper.""" try: @@ -20,12 +34,12 @@ def transcribe_chinese(sentence: str) -> str: def handle_mixed_language(word: str, wrapper: bool = True) -> str: """Handle a word with potential Chinese, other language, or number content.""" - if word.isdigit(): # Detect numbers (pass through unchanged) + if word.isdigit(): # numbers: passthrough return word - elif any(hanzi.is_simplified(char) for char in word): # Detect Simplified Chinese chars + elif is_chinese_token(word): # Chinese: IPA return transcribe_chinese(word) - else: # Non-Chinese word - return f"[[[[[{word}]]]]]" if wrapper else word + else: # Non-Chinese: wrap or passthrough + return f"{WRAP_PREFIX}{word}{WRAP_SUFFIX}" if wrapper else word def transcribe_multilingual( @@ -39,84 +53,106 @@ def transcribe_multilingual( """ Transcribe multilingual sentences (Chinese + non-Chinese passthrough/wrap) and save to a file. - Args: - data: The input data (list of dicts if JSON, list of strings if plain text). - output_file: Path to the output file. - json_inplace_update: If True, process JSON input and add IPA to the same JSON objects. - json_input_field: The field in the JSON data to transcribe (default: "sentence"). - json_output_field: The field to write the IPA transcription to (default: "sentence_ipa"). - wrapper: If True, wrap non-Chinese tokens like [[[[[token]]]]]. If False, leave them unchanged. + Also computes byte counts: + - transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were transcribed (Chinese tokens) + - not_transcribed_bytes: UTF-8 bytes of ORIGINAL tokens that were not transcribed + (includes Latin words, digits, punctuation, etc.) + These counts are based on ORIGINAL text tokens, so wrapper overhead is automatically excluded. + + Returns: + stats dict with transcribed_bytes, not_transcribed_bytes, total_bytes, and percents. 
""" + transcribed_bytes = 0 + not_transcribed_bytes = 0 + + def process_sentence(sentence: str) -> str: + nonlocal transcribed_bytes, not_transcribed_bytes + + # Split sentence using jieba (your original behavior) + seg_list = jieba.cut(sentence, cut_all=False) + seg_sentence = "".join(seg_list) + + # Split but keep punctuation + words = re.findall(r"\w+|[^\w\s]", seg_sentence, re.UNICODE) + + out_parts = [] + for tok in words: + tok_bytes = utf8_len(tok) + + if re.match(r"\w+", tok): + # word-ish token + if tok.isdigit(): + not_transcribed_bytes += tok_bytes + elif is_chinese_token(tok): + transcribed_bytes += tok_bytes + else: + not_transcribed_bytes += tok_bytes + + out_parts.append(handle_mixed_language(tok, wrapper=wrapper)) + else: + # punctuation / symbols + not_transcribed_bytes += tok_bytes + out_parts.append(tok) + + return " ".join(out_parts) + if json_inplace_update: # In-place update for JSON data for item in data: if json_input_field in item: sentence = item[json_input_field] - result = [] - - # Split sentence using jieba - seg_list = jieba.cut(sentence, cut_all=False) - seg_sentence = "".join(seg_list) + item[json_output_field] = process_sentence(sentence) - # Split sentence but keep punctuation - words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words - result.append(handle_mixed_language(word, wrapper=wrapper)) - else: - result.append(word) # Preserve punctuation - - transcription_result = " ".join(result) - item[json_output_field] = transcription_result - - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) print(f"In-place JSON transcription saved to {output_file}") else: - # Standard transcription (either JSON or plain text to plain text output) - with open(output_file, 'w', encoding='utf-8') as f: + # Standard transcription to plain text output (one line per item) + with open(output_file, "w", encoding="utf-8") as f: for item in data: - result = [] if isinstance(item, dict): sentence = item.get(json_input_field, "") else: sentence = item - # Split sentence using jieba - seg_list = jieba.cut(sentence, cut_all=False) - seg_sentence = "".join(seg_list) + transcription_result = process_sentence(sentence) + f.write(transcription_result + "\n") + print(transcription_result) - # Split sentence but keep punctuation - words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words - result.append(handle_mixed_language(word, wrapper=wrapper)) - else: - result.append(word) # Preserve punctuation + total_bytes = transcribed_bytes + not_transcribed_bytes + pct_transcribed = (transcribed_bytes / total_bytes * 100.0) if total_bytes else 0.0 + pct_not = (not_transcribed_bytes / total_bytes * 100.0) if total_bytes else 0.0 - transcription_result = " ".join(result) - f.write(transcription_result + "\n") - print(transcription_result) # Print to console + stats = { + "transcribed_bytes": transcribed_bytes, + "not_transcribed_bytes": not_transcribed_bytes, + "total_bytes": total_bytes, + "pct_transcribed": pct_transcribed, + "pct_not_transcribed": pct_not, + } + return stats def main(): - parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes (Chinese via dragonmapper).') + parser = argparse.ArgumentParser( + description="Transcribe multilingual sentences into IPA phonemes (Chinese via dragonmapper), 
with byte coverage stats." + ) parser.add_argument( - 'input_file', + "input_file", type=str, - help='Path to the input file containing sentences in json or text format.' + help="Path to the input file containing sentences in json or text format." ) parser.add_argument( - 'output_file', + "output_file", type=str, - help='Path to the output file for IPA transcription.' + help="Path to the output file for IPA transcription." ) parser.add_argument( - '--input_type', + "--input_type", type=str, - choices=['json', 'text'], - default='json', + choices=["json", "text"], + default="json", help='Type of input file: "json" or "text" (default: json)' ) parser.add_argument( @@ -140,18 +176,23 @@ def main(): action=argparse.BooleanOptionalAction, help="Wrap non-Chinese tokens as [[[[[...]]]]] (default: true). Use --no-wrapper to leave them unchanged." ) + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write stats as JSON to this path (in addition to printing)." + ) args = parser.parse_args() try: - with open(args.input_file, 'r', encoding='utf-8') as f: - if args.input_type == 'json': + with open(args.input_file, "r", encoding="utf-8") as f: + if args.input_type == "json": data = json.load(f) else: - # Keep lines as strings; strip newline later if you want data = [line.rstrip("\n") for line in f.readlines()] - transcribe_multilingual( + stats = transcribe_multilingual( data=data, output_file=args.output_file, json_inplace_update=args.json_inplace_update, @@ -160,6 +201,19 @@ def main(): wrapper=args.wrapper, ) + # Print summary stats (wrapper overhead is automatically excluded because we count ORIGINAL token bytes) + print("\n=== Byte Coverage Stats (based on ORIGINAL tokens) ===") + print(f"Transcribed bytes : {stats['transcribed_bytes']}") + print(f"Not transcribed bytes : {stats['not_transcribed_bytes']}") + print(f"Total bytes (counted) : {stats['total_bytes']}") + print(f"% transcribed : {stats['pct_transcribed']:.2f}%") + print(f"% not transcribed : {stats['pct_not_transcribed']:.2f}%") + + if args.stats_json: + with open(args.stats_json, "w", encoding="utf-8") as sf: + json.dump(stats, sf, ensure_ascii=False, indent=2) + print(f"Stats JSON written to: {args.stats_json}") + except FileNotFoundError: print(f"Error: Input file '{args.input_file}' not found.") except json.JSONDecodeError: @@ -168,6 +222,6 @@ def main(): print(f"An unexpected error occurred: {e}") -if __name__ == '__main__': +if __name__ == "__main__": main() From 21dab6240444b0d2ccb185e25b49473ebd8a71c4 Mon Sep 17 00:00:00 2001 From: klei22 Date: Tue, 23 Dec 2025 18:51:36 -0800 Subject: [PATCH 02/10] Adding stats for en, zh, and ja --- data/flores200-res/eng_stats.json | 7 +++++++ data/flores200-res/ja_stats.json | 7 +++++++ data/flores200-res/zh_stats.json | 7 +++++++ 3 files changed, 21 insertions(+) create mode 100644 data/flores200-res/eng_stats.json create mode 100644 data/flores200-res/ja_stats.json create mode 100644 data/flores200-res/zh_stats.json diff --git a/data/flores200-res/eng_stats.json b/data/flores200-res/eng_stats.json new file mode 100644 index 0000000000..abcded9fae --- /dev/null +++ b/data/flores200-res/eng_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208634, + "not_transcribed_bytes": 7905, + "total_bytes": 216539, + "pct_transcribed": 96.34938740827288, + "pct_not_transcribed": 3.6506125917271253 +} \ No newline at end of file diff --git a/data/flores200-res/ja_stats.json b/data/flores200-res/ja_stats.json new file mode 100644 index 0000000000..30b83d8af6 --- 
/dev/null +++ b/data/flores200-res/ja_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 305426, + "not_transcribed_bytes": 21596, + "total_bytes": 327022, + "pct_transcribed": 93.3961629492817, + "pct_not_transcribed": 6.6038370507183 +} \ No newline at end of file diff --git a/data/flores200-res/zh_stats.json b/data/flores200-res/zh_stats.json new file mode 100644 index 0000000000..eda44595ef --- /dev/null +++ b/data/flores200-res/zh_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 208747, + "not_transcribed_bytes": 27209, + "total_bytes": 235956, + "pct_transcribed": 88.4686127922155, + "pct_not_transcribed": 11.531387207784501 +} \ No newline at end of file From 8a5ec691e0688c66d1a9acd191dd5454cdca5db6 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sun, 28 Dec 2025 13:21:43 -0800 Subject: [PATCH 03/10] Add README.md stats and script updates --- data/flores200-res/README.md | 20 ++ data/flores200-res/get_dataset.sh | 215 +++++++++++++++++++- data/flores200-res/ko_stats.json | 7 + data/flores200-res/phoneticize.sh | 23 ++- data/template/utils/espeak2ipa.py | 326 +++++++++++++++++------------- 5 files changed, 446 insertions(+), 145 deletions(-) create mode 100644 data/flores200-res/README.md create mode 100644 data/flores200-res/ko_stats.json diff --git a/data/flores200-res/README.md b/data/flores200-res/README.md new file mode 100644 index 0000000000..aab97633c7 --- /dev/null +++ b/data/flores200-res/README.md @@ -0,0 +1,20 @@ +# Scripts compatible with Flores-200 Restructured + +This is a folder with scripts compatible with the Flores-200 project, originally +from: +https://github.com/facebookresearch/flores/blob/main/README.md + +Though scripts target the restructured format proposed by muhammadravi251001: +https://huggingface.co/datasets/muhammadravi251001/restructured-flores200 + +# License of dataset + +The Flores 200 dataset is licensed under CC-By-SA 4.0. 
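+## Usage
+
+A minimal sketch of the intended flow (both scripts live in this folder; see
+`phoneticize.sh` for the per-language phoneticizer invocations it wraps):
+
+```bash
+bash get_dataset.sh     # download the restructured Flores-200 text files
+bash phoneticize.sh     # convert each text_<lang>.txt into ipa_text_<lang>.txt
+```
+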
+ +## Language Codes + +Language Codes here for Flore-200: +https://github.com/facebookresearch/flores/blob/main/flores200/README.md + +Language Codes here for espeak (basis of many of the phoneticizers): +https://espeak.sourceforge.net/languages.html diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index deda56df65..d95b965614 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -17,9 +17,222 @@ lang_array=( "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" - "text_shn_Mymr" + "text_hin_Deva" + "text_vie_Latn" + "text_ind_Latn" + "text_swh_Latn" + "text_ell_Grek" + "text_fra_Latn" ) +# lang_array=( +# "text_ace_Arab" +# "text_ace_Latn" +# "text_acm_Arab" +# "text_acq_Arab" +# "text_aeb_Arab" +# "text_afr_Latn" +# "text_ajp_Arab" +# "text_aka_Latn" +# "text_als_Latn" +# "text_amh_Ethi" +# "text_apc_Arab" +# "text_arb_Arab" +# "text_arb_Latn" +# "text_ars_Arab" +# "text_ary_Arab" +# "text_arz_Arab" +# "text_asm_Beng" +# "text_ast_Latn" +# "text_awa_Deva" +# "text_ayr_Latn" +# "text_azb_Arab" +# "text_azj_Latn" +# "text_bak_Cyrl" +# "text_bam_Latn" +# "text_ban_Latn" +# "text_bel_Cyrl" +# "text_bem_Latn" +# "text_ben_Beng" +# "text_bho_Deva" +# "text_bjn_Arab" +# "text_bjn_Latn" +# "text_bod_Tibt" +# "text_bos_Latn" +# "text_bug_Latn" +# "text_bul_Cyrl" +# "text_cat_Latn" +# "text_ceb_Latn" +# "text_ces_Latn" +# "text_cjk_Latn" +# "text_ckb_Arab" +# "text_crh_Latn" +# "text_cym_Latn" +# "text_dan_Latn" +# "text_deu_Latn" +# "text_dik_Latn" +# "text_dyu_Latn" +# "text_dzo_Tibt" +# "text_ell_Grek" +# "text_eng_Latn" +# "text_epo_Latn" +# "text_est_Latn" +# "text_eus_Latn" +# "text_ewe_Latn" +# "text_fao_Latn" +# "text_fij_Latn" +# "text_fin_Latn" +# "text_fon_Latn" +# "text_fra_Latn" +# "text_fur_Latn" +# "text_fuv_Latn" +# "text_gaz_Latn" +# "text_gla_Latn" +# "text_gle_Latn" +# "text_glg_Latn" +# "text_grn_Latn" +# "text_guj_Gujr" +# "text_hat_Latn" +# "text_hau_Latn" +# "text_heb_Hebr" +# "text_hin_Deva" +# "text_hne_Deva" +# "text_hrv_Latn" +# "text_hun_Latn" +# "text_hye_Armn" +# "text_ibo_Latn" +# "text_ilo_Latn" +# "text_ind_Latn" +# "text_isl_Latn" +# "text_ita_Latn" +# "text_jav_Latn" +# "text_jpn_Jpan" +# "text_kab_Latn" +# +# "text_kac_Latn" +# "text_kam_Latn" +# "text_kan_Knda" +# "text_kas_Arab" +# "text_kas_Deva" +# "text_kat_Geor" +# "text_kaz_Cyrl" +# "text_kbp_Latn" +# "text_kea_Latn" +# "text_khk_Cyrl" +# "text_khm_Khmr" +# "text_kik_Latn" +# "text_kin_Latn" +# "text_kir_Cyrl" +# "text_kmb_Latn" +# "text_kmr_Latn" +# "text_knc_Arab" +# "text_knc_Latn" +# "text_kon_Latn" +# "text_kor_Hang" +# "text_lao_Laoo" +# "text_lij_Latn" +# "text_lim_Latn" +# "text_lin_Latn" +# "text_lit_Latn" +# "text_lmo_Latn" +# "text_ltg_Latn" +# "text_ltz_Latn" +# "text_lua_Latn" +# "text_lug_Latn" +# "text_luo_Latn" +# "text_lus_Latn" +# "text_lvs_Latn" +# "text_mag_Deva" +# "text_mai_Deva" +# "text_mal_Mlym" +# "text_mar_Deva" +# "text_min_Arab" +# "text_min_Latn" +# "text_mkd_Cyrl" +# "text_mlt_Latn" +# "text_mni_Beng" +# "text_mos_Latn" +# "text_mri_Latn" +# "text_mya_Mymr" +# "text_nld_Latn" +# "text_nno_Latn" +# "text_nob_Latn" +# "text_npi_Deva" +# "text_nso_Latn" +# "text_nus_Latn" +# "text_nya_Latn" +# "text_oci_Latn" +# "text_ory_Orya" +# "text_pag_Latn" +# "text_pan_Guru" +# "text_pap_Latn" +# "text_pbt_Arab" +# "text_pes_Arab" +# "text_plt_Latn" +# "text_pol_Latn" +# "text_por_Latn" +# "text_prs_Arab" +# "text_quy_Latn" +# "text_ron_Latn" +# "text_run_Latn" +# "text_rus_Cyrl" +# "text_sag_Latn" +# "text_san_Deva" +# 
"text_sat_Olck" +# "text_scn_Latn" +# "text_shn_Mymr" +# "text_sin_Sinh" +# "text_slk_Latn" +# "text_slv_Latn" +# "text_smo_Latn" +# "text_sna_Latn" +# "text_snd_Arab" +# "text_som_Latn" +# "text_sot_Latn" +# "text_spa_Latn" +# "text_srd_Latn" +# "text_srp_Cyrl" +# "text_ssw_Latn" +# "text_sun_Latn" +# "text_swe_Latn" +# "text_swh_Latn" +# "text_szl_Latn" +# "text_tam_Taml" +# "text_taq_Latn" +# "text_taq_Tfng" +# "text_tat_Cyrl" +# "text_tel_Telu" +# "text_tgk_Cyrl" +# "text_tgl_Latn" +# "text_tha_Thai" +# "text_tir_Ethi" +# "text_tpi_Latn" +# "text_tsn_Latn" +# "text_tso_Latn" +# "text_tuk_Latn" +# "text_tum_Latn" +# "text_tur_Latn" +# "text_twi_Latn" +# "text_tzm_Tfng" +# "text_uig_Arab" +# "text_ukr_Cyrl" +# "text_umb_Latn" +# "text_urd_Arab" +# "text_uzn_Latn" +# "text_vec_Latn" +# "text_vie_Latn" +# "text_war_Latn" +# "text_wol_Latn" +# "text_xho_Latn" +# "text_ydd_Hebr" +# "text_yor_Latn" +# "text_yue_Hant" +# "text_zho_Hans" +# "text_zho_Hant" +# "text_zsm_Latn" +# "text_zul_Latn" +# ) + # Add url with dataset here: url="https://huggingface.co/datasets/muhammadravi251001/restructured-flores200/tree/main/data" diff --git a/data/flores200-res/ko_stats.json b/data/flores200-res/ko_stats.json new file mode 100644 index 0000000000..5330f63aca --- /dev/null +++ b/data/flores200-res/ko_stats.json @@ -0,0 +1,7 @@ +{ + "transcribed_bytes": 271690, + "not_transcribed_bytes": 8833, + "total_bytes": 280523, + "pct_transcribed": 96.85123857936783, + "pct_not_transcribed": 3.148761420632176 +} \ No newline at end of file diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index b52f0c2789..98fee2e617 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -1,9 +1,22 @@ #!/bin/bash -python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json -python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json -python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json -python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json -python3 utils/espeak2ipa.py text_shn_Mymr.txt --mode text --output_file ipa_text_shan.txt --no-wrapper --stats_json shan_stats.json --lang shan +# python3 utils/en2ipa.py text_eng_Latn.txt --mode text --output_file ipa_text_eng_Latn.txt --no-wrapper --stats_json eng_stats.json +# python3 utils/ja2ipa.py text_jpn_Jpan.txt ipa_text_jpn_Jpan.txt --text_output --use_spacy --text_no_sentence --stats_json ja_stats.json +# python3 utils/ko_en_to_ipa.py text_kor_Hang.txt --text_input --text_output ipa_text_kor_Hang.txt --stats_json ko_stats.json +# python3 utils/zh_to_ipa.py text_zho_Hans.txt ipa_text_zho_Hans.txt --input_type text --no-wrapper --stats_json zh_stats.json +lang_array=( + "text_vie_Latn:vi" + "text_ind_Latn:id" + "text_swh_Latn:sw" + "text_ell_Grek:el" + "text_fra_Latn:fr" +) + +for lang in "${lang_array[@]}"; do + text_file="${lang%%:*}" + two_letter_code="${lang##*:}" + echo "${text_file}; ${two_letter_code}" + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence +done diff --git a/data/template/utils/espeak2ipa.py b/data/template/utils/espeak2ipa.py index 9c2864afc0..0e017747fa 100644 --- 
a/data/template/utils/espeak2ipa.py +++ b/data/template/utils/espeak2ipa.py @@ -2,27 +2,38 @@ # espeak2ipa.py # # Generic IPA transcription using espeak-ng for ANY supported voice. -# Defaults to "shan" (you can override with --lang). +# Defaults to "shn" (override with --lang). # -# Features (modeled after your en2ipa.py): -# - JSON list mode (--mode json): in-place update of a JSON list file -# - Text mode (--mode text): line-by-line transcription to output file (or overwrite input) +# Features: +# - JSON list mode (--mode json): +# - default: overwrite input JSON file adding output_json_key per item +# - with --text_output: emit a text file (sentenceipa OR ipa-only via --text_no_sentence) +# - Text mode (--mode text): input is one sentence per line +# - default: emits IPA-only (backward-compatible with your existing espeak2ipa.py) +# - with --text_output: emits sentenceipa (JP-like), unless --text_no_sentence # - Optional wrapping for untranscribed/unparseable tokens: [[[[[...]]]]] # - Multithreading with ordered output # - Rich progress bar # - Byte coverage stats (based on ORIGINAL tokens; wrapper overhead excluded) # # Notes: -# - "transcribed_bytes" counts bytes of ORIGINAL tokens we ATTEMPT to send to espeak +# - "transcribed_bytes" counts UTF-8 bytes of ORIGINAL tokens we ATTEMPT to send to espeak # (tokens that contain at least one Unicode letter). Digits/punct count as not_transcribed. -# - espeak-ng voices vary; if a voice is unavailable, you'll get an error/empty output. +# - If espeak-ng outputs empty text for a token, we treat it as "unparseable" and optionally wrap it. import subprocess import argparse import re import json -from typing import List, Tuple, Optional, Dict, Any -from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn, MofNCompleteColumn +from typing import List, Optional, Dict, Any, Tuple +from rich.progress import ( + Progress, + BarColumn, + TextColumn, + TimeRemainingColumn, + TimeElapsedColumn, + MofNCompleteColumn, +) from concurrent.futures import ThreadPoolExecutor, as_completed import os import threading @@ -55,7 +66,7 @@ def transcribe_espeak(token: str, lang: str, wrapper: bool = False) -> str: result = subprocess.run( ["espeak-ng", "-q", "-v", lang, "--ipa", token], capture_output=True, - text=True + text=True, ) out = (result.stdout or "").strip().replace("ㆍ", " ") if not out: @@ -97,12 +108,18 @@ def tokens_to_ipa_string(tokens: List[str], lang: str, wrapper: bool) -> str: return " ".join(out) -def _worker_sentence(sentence: str, lang: str, wrapper: bool, stats: Optional[Dict[str, int]] = None) -> str: +def _worker_sentence( + sentence: str, + lang: str, + wrapper: bool, + stats: Optional[Dict[str, int]] = None, +) -> str: """ Tokenize & transcribe one sentence/line. + If stats is provided, updates byte counts based on ORIGINAL tokens: - transcribed_bytes: tokens containing at least one letter - - not_transcribed_bytes: digits + punctuation/symbols + other \w tokens with no letters + - not_transcribed_bytes: digits + punctuation/symbols + other \\w tokens with no letters """ tokens = _WORD_RE.findall(sentence) @@ -134,97 +151,19 @@ def _progress() -> Progress: ) -def transcribe_json_list( - json_text_or_obj, - input_json_key: str, - output_json_key: str, - lang: str, - wrapper: bool, - multithread: bool, - workers: int, - stats: Optional[Dict[str, int]] = None, -) -> Optional[str]: - """ - JSON list mode: reads a JSON list of objects, writes output_json_key for each object. 
- Returns JSON string (pretty printed). - """ - try: - data = json.loads(json_text_or_obj) if isinstance(json_text_or_obj, str) else json_text_or_obj - if not isinstance(data, list): - raise ValueError("JSON data should be a list of objects.") - except Exception as e: - print(f"Error: {e}") - return None - - n = len(data) - if stats is None: - stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} - else: - stats.setdefault("transcribed_bytes", 0) - stats.setdefault("not_transcribed_bytes", 0) - - if n == 0: - return json.dumps(data, ensure_ascii=False, indent=4) - - if not multithread or workers <= 1: - with _progress() as progress: - task = progress.add_task("Processing JSON items", total=n) - for item in data: - if input_json_key in item: - sentence = item[input_json_key] - item[output_json_key] = _worker_sentence(sentence, lang=lang, wrapper=wrapper, stats=stats) - progress.update(task, advance=1) - else: - # ordered results - results: List[Tuple[int, str]] = [None] * n # type: ignore - per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore - - jobs = [(i, data[i].get(input_json_key, "")) for i in range(n)] - - with _progress() as progress: - task = progress.add_task(f"Processing JSON items (mt x{workers})", total=n) - with ThreadPoolExecutor(max_workers=workers) as ex: - future_to_idx = {} - for idx, sentence in jobs: - local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} - per_item_stats[idx] = local_stats - fut = ex.submit(_worker_sentence, sentence, lang, wrapper, local_stats) - future_to_idx[fut] = idx - - for fut in as_completed(future_to_idx): - idx = future_to_idx[fut] - try: - res = fut.result() - except Exception as e: - res = f"Error: {e}" - results[idx] = (idx, res) - progress.update(task, advance=1) - - # merge stats - for st in per_item_stats: - stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) - stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) - - # write back in original order - for idx, item in enumerate(data): - if input_json_key in item: - item[output_json_key] = results[idx][1] - - return json.dumps(data, ensure_ascii=False, indent=4) - - -def transcribe_text_lines( - lines: List[str], +def transcribe_sentences( + sentences: List[str], lang: str, wrapper: bool, multithread: bool, workers: int, stats: Optional[Dict[str, int]] = None, + progress_label: str = "Processing", ) -> List[str]: """ - Text mode: input is one sentence per line. Output is one IPA line per input line. + Transcribe a list of sentences into IPA, returning results in the same order. 
""" - n = len(lines) + n = len(sentences) if stats is None: stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} else: @@ -235,41 +174,52 @@ def transcribe_text_lines( return [] if not multithread or workers <= 1: - out_lines: List[str] = [] + out: List[str] = [] with _progress() as progress: - task = progress.add_task("Processing text lines", total=n) - for line in lines: - raw = line.rstrip("\n") - out_lines.append(_worker_sentence(raw, lang=lang, wrapper=wrapper, stats=stats)) + task = progress.add_task(progress_label, total=n) + for s in sentences: + out.append(_worker_sentence(s, lang=lang, wrapper=wrapper, stats=stats)) progress.update(task, advance=1) - return out_lines + return out - out_lines: List[str] = [None] * n # type: ignore + # Multithreaded path: per-item stats then merge at end + out: List[str] = ["" for _ in range(n)] per_item_stats: List[Dict[str, int]] = [None] * n # type: ignore with _progress() as progress: - task = progress.add_task(f"Processing text lines (mt x{workers})", total=n) + task = progress.add_task(f"{progress_label} (mt x{workers})", total=n) with ThreadPoolExecutor(max_workers=workers) as ex: future_to_idx = {} - for i in range(n): + for i, s in enumerate(sentences): local_stats = {"transcribed_bytes": 0, "not_transcribed_bytes": 0} per_item_stats[i] = local_stats - fut = ex.submit(_worker_sentence, lines[i].rstrip("\n"), lang, wrapper, local_stats) + fut = ex.submit(_worker_sentence, s, lang, wrapper, local_stats) future_to_idx[fut] = i for fut in as_completed(future_to_idx): i = future_to_idx[fut] try: - out_lines[i] = fut.result() + out[i] = fut.result() except Exception as e: - out_lines[i] = f"Error: {e}" + out[i] = f"Error: {e}" progress.update(task, advance=1) for st in per_item_stats: stats["transcribed_bytes"] += st.get("transcribed_bytes", 0) stats["not_transcribed_bytes"] += st.get("not_transcribed_bytes", 0) - return out_lines + return out + + +def format_text_lines( + sentences: List[str], + ipa_lines: List[str], + include_sentence: bool, + sep: str, +) -> List[str]: + if not include_sentence: + return ipa_lines + return [f"{s}{sep}{ipa}" for s, ipa in zip(sentences, ipa_lines)] def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[str] = None) -> Dict[str, Any]: @@ -306,42 +256,99 @@ def finalize_and_print_stats(stats: Dict[str, int], stats_json_path: Optional[st def main(): parser = argparse.ArgumentParser( - description="Generic IPA transcription using espeak-ng for any supported voice (default: shan). " - "Supports JSON list mode and plain-text line mode, with byte coverage stats." + description=( + "Generic IPA transcription using espeak-ng for any supported voice (default: shn). " + "Supports JSON list mode and plain-text line mode, with byte coverage stats.\n\n" + "NEW: --text_output and --text_no_sentence (JP-style) to optionally emit only IPA." + ) ) parser.add_argument("input_file", type=str, help="Path to the input file (JSON list or plain text).") # Language / voice - parser.add_argument("--lang", default="shan", - help="espeak-ng voice/language code (default: shan). Example: en, fr, de, es, ja, zh, etc.") + parser.add_argument( + "--lang", + default="shn", + help="espeak-ng voice/language code (default: shn). Example: en, fr, de, es, ja, zh, etc.", + ) # Mode selection - parser.add_argument("--mode", choices=["json", "text"], default="json", - help='Processing mode. 
"json" expects a JSON list; "text" treats file as plain text.') + parser.add_argument( + "--mode", + choices=["json", "text"], + default="json", + help='Processing mode. "json" expects a JSON list; "text" treats file as plain text.', + ) # JSON mode params - parser.add_argument("--input_json_key", type=str, - help="JSON key to read sentences from (required for --mode json).") - parser.add_argument("--output_json_key", type=str, default="ipa", - help='JSON key to store IPA (default: "ipa").') + parser.add_argument( + "--input_json_key", + type=str, + help="JSON key to read sentences from (required for --mode json).", + ) + parser.add_argument( + "--output_json_key", + type=str, + default="ipa", + help='JSON key to store IPA (default: "ipa").', + ) - # Text mode params - parser.add_argument("--output_file", type=str, default=None, - help="Output file path for text mode. Defaults to overwriting input.") + # Output path (used for text outputs; in JSON update mode we overwrite input_file) + parser.add_argument( + "--output_file", + type=str, + default=None, + help="Output file path for text outputs. In --mode text, defaults to overwriting input.", + ) # Wrapper option - parser.add_argument("--wrapper", default=False, action=argparse.BooleanOptionalAction, - help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).") + parser.add_argument( + "--wrapper", + default=False, + action=argparse.BooleanOptionalAction, + help="Wrap unparseable tokens with [[[[[...]]]]] (default: false).", + ) # Multithreading options - parser.add_argument("--multithread", default=False, action=argparse.BooleanOptionalAction, - help="Enable multithreading while preserving output order.") - parser.add_argument("--workers", type=int, default=os.cpu_count() or 4, - help="Number of worker threads when --multithread is enabled (default: CPU count).") + parser.add_argument( + "--multithread", + default=False, + action=argparse.BooleanOptionalAction, + help="Enable multithreading while preserving output order.", + ) + parser.add_argument( + "--workers", + type=int, + default=os.cpu_count() or 4, + help="Number of worker threads when --multithread is enabled (default: CPU count).", + ) # Stats output - parser.add_argument("--stats_json", type=str, default=None, - help="Optional: write byte coverage stats as JSON to this path (in addition to printing).") + parser.add_argument( + "--stats_json", + type=str, + default=None, + help="Optional: write byte coverage stats as JSON to this path (in addition to printing).", + ) + + # NEW: JP-style text emission controls + parser.add_argument( + "--text_output", + action="store_true", + help=( + "Emit text output lines instead of JSON update in --mode json. " + 'In --mode text, when set, emit "sentenceipa" lines (unless --text_no_sentence).' 
+ ), + ) + parser.add_argument( + "--text_no_sentence", + action="store_true", + help="In text output mode, emit only the IPA (omit the original sentence).", + ) + parser.add_argument( + "--text_sep", + default="\t", + help='Separator used between sentence and IPA in text output mode (default: tab).', + ) args = parser.parse_args() @@ -357,40 +364,79 @@ def main(): raise ValueError("--input_json_key is required when --mode json") with open(args.input_file, "r", encoding="utf-8") as f: - input_content = f.read() + data = json.load(f) + + if not isinstance(data, list): + raise ValueError("JSON data should be a list of objects.") - updated_json = transcribe_json_list( - input_content, - input_json_key=args.input_json_key, - output_json_key=args.output_json_key, + # collect sentences (only items that contain input_json_key) + indices: List[int] = [] + sentences: List[str] = [] + for i, item in enumerate(data): + if isinstance(item, dict) and args.input_json_key in item: + indices.append(i) + sentences.append(str(item[args.input_json_key])) + + ipa_lines = transcribe_sentences( + sentences, lang=args.lang, wrapper=args.wrapper, multithread=args.multithread, workers=args.workers, stats=stats, + progress_label="Processing JSON items", ) - if updated_json is not None: - # matches your existing style: overwrite JSON input file + + if args.text_output: + include_sentence = not args.text_no_sentence + out_lines = format_text_lines(sentences, ipa_lines, include_sentence, args.text_sep) + + target_path = args.output_file + if not target_path: + target_path = args.input_file + ".ipa.txt" + + with open(target_path, "w", encoding="utf-8") as f: + f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + + print(f"✅ Successfully wrote text output to '{target_path}'") + else: + # default behavior: update JSON in-place (overwrite input_file) + for idx, ipa in zip(indices, ipa_lines): + data[idx][args.output_json_key] = ipa + with open(args.input_file, "w", encoding="utf-8") as f: - f.write(updated_json) + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"✅ Successfully updated JSON data in '{args.input_file}'") else: + # ---- TEXT MODE ---- with open(args.input_file, "r", encoding="utf-8") as f: - lines = f.readlines() + raw_lines = f.readlines() + + sentences = [ln.rstrip("\n") for ln in raw_lines] - out_lines = transcribe_text_lines( - lines, + ipa_lines = transcribe_sentences( + sentences, lang=args.lang, wrapper=args.wrapper, multithread=args.multithread, workers=args.workers, stats=stats, + progress_label="Processing text lines", ) + if args.text_output: + include_sentence = not args.text_no_sentence + out_lines = format_text_lines(sentences, ipa_lines, include_sentence, args.text_sep) + else: + # backward-compatible default: IPA-only + out_lines = ipa_lines + target_path = args.output_file if args.output_file else args.input_file with open(target_path, "w", encoding="utf-8") as f: f.write("\n".join(out_lines) + ("\n" if out_lines else "")) + print(f"✅ Successfully wrote transcribed text to '{target_path}'") finalize_and_print_stats(stats, stats_json_path=args.stats_json) @@ -399,6 +445,8 @@ def main(): print(f"Error: Input file '{args.input_file}' not found.") except ValueError as ve: print(f"Error: {ve}") + except json.JSONDecodeError: + print(f"Error: Invalid JSON format in '{args.input_file}'.") if __name__ == "__main__": From 8e07a234d62312ded41c9c608380350ab7f969f2 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 10:15:37 -0800 Subject: [PATCH 04/10] Add Yue --- 
data/flores200-res/get_dataset.sh | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/data/flores200-res/get_dataset.sh b/data/flores200-res/get_dataset.sh index d95b965614..b3096de376 100644 --- a/data/flores200-res/get_dataset.sh +++ b/data/flores200-res/get_dataset.sh @@ -1,22 +1,12 @@ #!/bin/bash -### Instructions: -# 1. Replace "INSERT_URL_WITH_FILES" with the actual URL to the Parquet files. -# 2. Modify the "include_keys" array to specify the keys you want to include in the output. -# 3. (Optionally) Modify the "value_prefixes" array to set prefixes for each value, use "" for empty prefixes -# 4. Set "--skip_empty" to true if you want to skip empty fields, or false if not needed. -# 5. Set "--no_output_text" to true if you plan to process the intermediate json files in a custom manner. -# 6. For CSV files with BOM headers, pass "--input_encoding utf-8-sig" to the helper script. -# 7. For CSV cells that contain multi-line text, use "--split_multiline_values" to emit one line per entry or -# "--newline_replacement" to substitute newline characters with custom text. - -# Run the Python script with the specified arguments lang_array=( "text_eng_Latn" "text_jpn_Jpan" "text_kor_Hang" "text_zho_Hans" + "text_yue_Hant" "text_hin_Deva" "text_vie_Latn" "text_ind_Latn" @@ -242,5 +232,4 @@ for lang in "${lang_array[@]}"; do --include_keys "$lang" \ --value_prefix $'\n' \ --output_text_file "$lang".txt - done From 8346e3a60804637850e65354f3ba3347307f1d59 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 10:16:35 -0800 Subject: [PATCH 05/10] Add Yue to get dataset --- data/flores200-res/phoneticize.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/flores200-res/phoneticize.sh b/data/flores200-res/phoneticize.sh index 98fee2e617..2552a922ff 100644 --- a/data/flores200-res/phoneticize.sh +++ b/data/flores200-res/phoneticize.sh @@ -12,11 +12,14 @@ lang_array=( "text_swh_Latn:sw" "text_ell_Grek:el" "text_fra_Latn:fr" + "text_yue_Hant:yue" ) for lang in "${lang_array[@]}"; do text_file="${lang%%:*}" two_letter_code="${lang##*:}" echo "${text_file}; ${two_letter_code}" - python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + if [ ! 
-f "ipa_${text_file}.txt" ]; then + python3 utils/espeak2ipa.py "$text_file".txt --mode text --output_file ipa_"$text_file".txt --no-wrapper --stats_json stats_"$text_file".json --lang "$two_letter_code" --text_no_sentence + fi done From fe33152876404b22594cbc577600c70bfa839ef8 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 11:04:16 -0800 Subject: [PATCH 06/10] Add graphs for grouping bytes of languages --- data/flores200-res/graphs.sh | 9 + .../plot_langscript_sizes_grouped.py | 216 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 data/flores200-res/graphs.sh create mode 100644 data/flores200-res/plot_langscript_sizes_grouped.py diff --git a/data/flores200-res/graphs.sh b/data/flores200-res/graphs.sh new file mode 100644 index 0000000000..aabf067fda --- /dev/null +++ b/data/flores200-res/graphs.sh @@ -0,0 +1,9 @@ +#!/bin/bash +python3 plot_langscript_sizes_grouped.py --group-by script --color-by script --out by_script.png +python3 plot_langscript_sizes_grouped.py --group-by script --color-by region --out by_region_script.png +python3 plot_langscript_sizes_grouped.py --group-by region --color-by region --out by_region.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by family --out by_family.png +python3 plot_langscript_sizes_grouped.py --group-by family --color-by script --out by_family_script.png + + + diff --git a/data/flores200-res/plot_langscript_sizes_grouped.py b/data/flores200-res/plot_langscript_sizes_grouped.py new file mode 100644 index 0000000000..c1462018c1 --- /dev/null +++ b/data/flores200-res/plot_langscript_sizes_grouped.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +import json +import re +from collections import Counter, defaultdict +import matplotlib.pyplot as plt + +# text__ + + + +
+

{title}

+ Click a row (or use dropdown) to update per-file counts. +
+ +
+
+

Vocab + total frequency (directory aggregate)

+
+ + + + +
+
+
+ Showing top {len(token_rows)} tokens by frequency. +
+
+ +
+

Per-file counts

+
+
+ Bars show token count per file (same tokenization as training). +
+
+
+ + + + +""" + out_path.write_text(html, encoding="utf-8") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--vocab", default="trained_spm_model.vocab", + help="SentencePiece vocab file (default: trained_spm_model.vocab). Used to infer .model if --model not given.") + ap.add_argument("--model", default=None, + help="SentencePiece model file (.model). If omitted, inferred from --vocab by replacing .vocab -> .model.") + ap.add_argument("--dir", required=True, + help="Directory of text files to scan.") + ap.add_argument("--recursive", action="store_true", + help="Recurse into subdirectories (default: false).") + ap.add_argument("--suffixes", default=".txt", + help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + ap.add_argument("--top-k", type=int, default=1500, + help="Embed only top-K tokens by total frequency into the HTML for interactivity (default: 1500).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + ap.add_argument("--min-count", type=int, default=1, + help="Only consider tokens with total count >= this (default: 1).") + args = ap.parse_args() + + vocab_path = Path(args.vocab) + model_path = Path(args.model) if args.model else infer_model_path_from_vocab(vocab_path) + root = Path(args.dir) + out = Path(args.out) + + if not model_path.exists(): + raise SystemExit(f"SentencePiece model not found: {model_path} (pass --model or ensure it matches --vocab)") + if not root.exists() or not root.is_dir(): + raise SystemExit(f"Directory not found: {root}") + + suffixes = tuple(s.strip().lower() for s in args.suffixes.split(",") if s.strip()) + files = iter_text_files(root, recursive=args.recursive, suffixes=suffixes) + if not files: + raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") + + sp = spm.SentencePieceProcessor(model_file=str(model_path)) + vocab_size = sp.get_piece_size() + + print(f"[info] model: {model_path}") + print(f"[info] vocab size: {vocab_size}") + print(f"[info] scanning {len(files)} files under: {root}") + + total = Counter() + per_file: Dict[str, Counter] = {} + file_names: List[str] = [] + + for p in files: + rel = str(p.relative_to(root)) + file_names.append(rel) + c = count_tokens_in_file(sp, p) + per_file[rel] = c + total.update(c) + + # Build top tokens (by total frequency) + items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] + items.sort(key=lambda x: x[1], reverse=True) + + if not items: + raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") + + top_items = items[: max(1, args.top_k)] + + token_rows: List[Dict] = [] + # per_file_counts: token_id(str) -> file -> count (only for embedded tokens) + per_file_counts: Dict[str, Dict[str, int]] = {} + + for tid, cnt in top_items: + tok = human_token(sp.id_to_piece(int(tid))) + token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) + + # Build per-file map for embedded tokens + for tid, _ in top_items: + tid = int(tid) + k = str(tid) + per_file_counts[k] = {} + for fn in file_names: + v = per_file[fn].get(tid, 0) + if v: + per_file_counts[k][fn] = int(v) + + default_token_id = int(top_items[0][0]) + + title = f"SentencePiece token frequency dashboard ({root.name})" + build_html( + title=title, + token_rows=token_rows, + per_file_counts=per_file_counts, + file_order=file_names, + default_token_id=default_token_id, + out_path=out, + ) + + print(f"[done] wrote: {out}") + 
print(f"[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + + +if __name__ == "__main__": + main() + diff --git a/data/flores200-res/tokenize_and_annotate_sizes.py b/data/flores200-res/tokenize_and_annotate_sizes.py index 152c9599a8..e39d834899 100644 --- a/data/flores200-res/tokenize_and_annotate_sizes.py +++ b/data/flores200-res/tokenize_and_annotate_sizes.py @@ -81,7 +81,7 @@ def main() -> None: ap.add_argument( "--in-json", - default="filtered_scripts.json", + default="filtered_files.json", help="Input JSON from filter_files_by_script.py", ) ap.add_argument( From 686f2b2b5affba3402d9ba1c8da0363ef895adae Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 13:05:48 -0800 Subject: [PATCH 09/10] Add .gitignore --- data/flores200-res/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 data/flores200-res/.gitignore diff --git a/data/flores200-res/.gitignore b/data/flores200-res/.gitignore new file mode 100644 index 0000000000..e33609d251 --- /dev/null +++ b/data/flores200-res/.gitignore @@ -0,0 +1 @@ +*.png From 375e5b945908c1ceacb8a0f50fc0006fdfae2662 Mon Sep 17 00:00:00 2001 From: klei22 Date: Sat, 3 Jan 2026 19:10:11 -0800 Subject: [PATCH 10/10] Add updates to latest scripts --- data/flores200-res/ipa_scripts.sh | 11 +- .../flores200-res/spm_vocab_freq_dashboard.py | 338 ++++++++++++++---- 2 files changed, 276 insertions(+), 73 deletions(-) diff --git a/data/flores200-res/ipa_scripts.sh b/data/flores200-res/ipa_scripts.sh index 4b004aa1a6..ead45ced52 100644 --- a/data/flores200-res/ipa_scripts.sh +++ b/data/flores200-res/ipa_scripts.sh @@ -1,8 +1,8 @@ # include tokenized comparison (uses tokenized_sizes["tiktoken"] from filtered_scripts.json) -python3 plot_ipa_vs_text.py \ - --text-dir text --ipa-dir ipa \ - --filtered-json filtered_files.json \ - --tok-method tiktoken +# python3 plot_ipa_vs_text.py \ +# --text-dir text --ipa-dir ipa \ +# --filtered-json filtered_files.json \ +# --tok-method tiktoken # # save everything to plots_out/ # python3 plot_ipa_vs_text.py \ @@ -18,3 +18,6 @@ python3 plot_ipa_vs_text.py \ # --tok-method tiktoken \ # --skip-missing-tok + +python3 plot_ipa_vs_text.py --text-dir text --ipa-dir ipa --save --outdir plots_out --csv + diff --git a/data/flores200-res/spm_vocab_freq_dashboard.py b/data/flores200-res/spm_vocab_freq_dashboard.py index f07da105e9..5a0bb941d5 100644 --- a/data/flores200-res/spm_vocab_freq_dashboard.py +++ b/data/flores200-res/spm_vocab_freq_dashboard.py @@ -2,18 +2,22 @@ """ spm_vocab_freq_dashboard.py -Build a single self-contained HTML dashboard (Plotly + vanilla JS) that shows: +Single-script, self-contained HTML dashboard (Plotly + vanilla JS) that shows: LEFT: - - SentencePiece vocab tokens + total frequency across *all* .txt files in a directory - - searchable dropdown to pick a token (and optional click-to-select from table) + - SentencePiece vocab tokens + total frequency across *all* text files in a directory + - searchable dropdown to pick a token + - click-to-select from the token table -RIGHT: - - per-file counts for the currently-selected token (bar chart) - - updates live in the same HTML (no server) +RIGHT (top): + - per-file counts for the selected token (bar chart) + +RIGHT (bottom): + - square similarity heatmap clustering text files by similarity across high-frequency vocab + (cosine similarity over TF-IDF on top vocab tokens) Why we require a .model: - - SentencePiece tokenization is not plain substring matching; to get true token frequencies, + - SentencePiece tokenization is 
not substring matching; to get true token frequencies, we MUST encode text using the SentencePiece model. Defaults: @@ -25,8 +29,8 @@ vocab_freq_dashboard.html (or --out) Example: - python3 spm_vocab_freq_dashboard.py --dir ./text --vocab trained_spm_model.vocab - python3 spm_vocab_freq_dashboard.py --dir ./text --model trained_spm_model.model --top-k 2000 + python3 spm_vocab_freq_dashboard.py --dir ./text --vocab trained_spm_model.vocab --heatmap + python3 spm_vocab_freq_dashboard.py --dir ./text --heatmap-top-k 500 --recursive """ from __future__ import annotations @@ -35,24 +39,26 @@ import json from collections import Counter from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional import sentencepiece as spm +# For heatmap similarity: NumPy required (SciPy not needed) +try: + import numpy as np +except Exception as e: + np = None + _NUMPY_IMPORT_ERROR = e + def infer_model_path_from_vocab(vocab_path: Path) -> Path: - # trained_spm_model.vocab -> trained_spm_model.model if vocab_path.suffix.lower() == ".vocab": return vocab_path.with_suffix(".model") - # fallback: append .model return Path(str(vocab_path) + ".model") def iter_text_files(root: Path, recursive: bool, suffixes: Tuple[str, ...]) -> List[Path]: - if recursive: - it = root.rglob("*") - else: - it = root.glob("*") + it = root.rglob("*") if recursive else root.glob("*") files = [] for p in it: if p.is_file() and p.suffix.lower() in suffixes: @@ -74,34 +80,179 @@ def count_tokens_in_file(sp: spm.SentencePieceProcessor, path: Path) -> Counter: def human_token(tok: str) -> str: - # Make SentencePiece boundary visible and avoid crazy HTML rendering. - # Keep it readable: ▁ (U+2581) is the "word boundary" marker in SPM. return tok.replace("\t", " ").replace("\n", "\\n") +def _build_tfidf_matrix( + file_names: List[str], + per_file: Dict[str, Counter], + token_ids: List[int], +) -> "np.ndarray": + """ + docs x tokens TF-IDF, L2-normalized per doc + """ + assert np is not None + n_docs = len(file_names) + n_tok = len(token_ids) + if n_docs == 0 or n_tok == 0: + return np.zeros((n_docs, n_tok), dtype=np.float32) + + tok_to_col = {tid: j for j, tid in enumerate(token_ids)} + + # document frequency + df = np.zeros((n_tok,), dtype=np.int32) + for fn in file_names: + c = per_file[fn] + for tid in c.keys(): + j = tok_to_col.get(tid) + if j is not None: + df[j] += 1 + + # smooth idf + idf = np.log((n_docs + 1.0) / (df.astype(np.float32) + 1.0)) + 1.0 + + X = np.zeros((n_docs, n_tok), dtype=np.float32) + for i, fn in enumerate(file_names): + c = per_file[fn] + row = X[i] + for tid, cnt in c.items(): + j = tok_to_col.get(tid) + if j is not None: + row[j] = float(cnt) + + row *= idf + + # L2 normalize + norm = float(np.linalg.norm(row)) + if norm > 0: + row /= norm + + return X + + +def _cosine_similarity_matrix(X: "np.ndarray") -> "np.ndarray": + """ + X assumed rows L2-normalized; cosine similarity = X @ X.T + """ + assert np is not None + if X.size == 0: + return np.zeros((X.shape[0], X.shape[0]), dtype=np.float32) + S = X @ X.T + # numerical guard + S = np.clip(S, -1.0, 1.0).astype(np.float32) + # make diagonal exactly 1 + n = S.shape[0] + for i in range(n): + S[i, i] = 1.0 + return S + + +def _order_by_simple_clustering(S: "np.ndarray") -> List[int]: + """ + Optional: reorder files so similar ones are near each other, without SciPy. 
+ Greedy "nearest neighbor chain" heuristic: + - start from most "central" (max average similarity) + - repeatedly append most similar unused item to the last + """ + assert np is not None + n = S.shape[0] + if n <= 2: + return list(range(n)) + + avg = S.mean(axis=1) + start = int(np.argmax(avg)) + order = [start] + used = set(order) + + while len(order) < n: + last = order[-1] + # pick unused with max similarity to last + best_j = None + best_val = -1e9 + for j in range(n): + if j in used: + continue + v = float(S[last, j]) + if v > best_val: + best_val = v + best_j = j + order.append(int(best_j)) + used.add(int(best_j)) + + return order + + +def _build_heatmap_payload( + file_names: List[str], + per_file: Dict[str, Counter], + token_ids_for_heatmap: List[int], + reorder: bool, +) -> Dict: + """ + Returns plotly-ready payload for similarity heatmap. + """ + if np is None: + raise RuntimeError( + "NumPy is required for heatmap mode.\n" + f"Import error: {_NUMPY_IMPORT_ERROR!r}\n" + "Install: python3 -m pip install numpy" + ) + + if len(file_names) < 2: + return {"ok": False, "reason": "Need at least 2 files to build a similarity heatmap."} + + X = _build_tfidf_matrix(file_names, per_file, token_ids_for_heatmap) + S = _cosine_similarity_matrix(X) + + idx = list(range(len(file_names))) + if reorder: + idx = _order_by_simple_clustering(S) + + labels = [file_names[i] for i in idx] + S2 = S[np.ix_(idx, idx)] + + # convert to nested lists for JSON + z = S2.tolist() + + traces = [{ + "type": "heatmap", + "z": z, + "x": labels, + "y": labels, + "zmin": 0.0, + "zmax": 1.0, + "hovertemplate": "x=%{x}
<br>y=%{y}<br>
cosine=%{z:.3f}", + # no explicit colorscale specified (Plotly default) to match your earlier “no custom colors” vibe + }] + + layout = { + "margin": {"l": 120, "r": 20, "t": 40, "b": 120}, + "title": "File similarity heatmap (TF-IDF on high-freq SPM vocab, cosine similarity)", + "xaxis": {"tickangle": 35, "automargin": True}, + "yaxis": {"automargin": True}, + } + + return {"ok": True, "traces": traces, "layout": layout} + + def build_html( title: str, token_rows: List[Dict], per_file_counts: Dict[str, Dict[str, int]], file_order: List[str], default_token_id: int, + heatmap_payload: Optional[Dict], out_path: Path, ) -> None: - """ - token_rows: list of dicts for top-k tokens: {id, token, count} - per_file_counts: { token_id(str) -> { file_name -> count } } only for tokens we embed - file_order: stable order of files for bar chart - """ payload = { "title": title, "tokens": token_rows, "per_file": per_file_counts, "files": file_order, "default_token_id": default_token_id, + "heatmap": heatmap_payload, } - # NOTE: This uses Plotly CDN for a "dynamic" interactive page without extra deps. - # If you need fully offline HTML (no CDN), we can embed plotly.min.js, but the file is huge. html = f""" @@ -170,10 +321,21 @@ def build_html( flex: 1; min-height: 200px; }} + .rightCharts {{ + display: flex; + flex-direction: column; + gap: 10px; + flex: 1; + overflow: hidden; + }} #barDiv {{ flex: 1; min-height: 200px; }} + #heatDiv {{ + flex: 1; + min-height: 260px; + }} .note {{ font-size: 12px; color: #555; @@ -187,7 +349,7 @@ def build_html(

{title}

- Click a row (or use dropdown) to update per-file counts. + Pick a token to update per-file counts; heatmap shows file similarity via high-frequency vocab.
@@ -206,10 +368,13 @@ def build_html(
-

Per-file counts

-
+

Per-file counts + similarity heatmap

+
+
+
+
- Bars show token count per file (same tokenization as training). + Heatmap uses TF-IDF over high-frequency SentencePiece tokens and cosine similarity.
@@ -218,9 +383,7 @@ def build_html( const DATA = {json.dumps(payload, ensure_ascii=False)}; function fmtTokenRow(t) {{ - // Make whitespace visible-ish in dropdown let s = t.token; - // show the word-boundary marker as "▁" (already is), but keep readable if (s.length > 60) s = s.slice(0, 57) + "…"; return `${{t.id}}: ${{s}} (${{t.count}})`; }} @@ -243,7 +406,6 @@ def build_html( }} function renderTable(tokens) {{ - // Plotly table const ids = tokens.map(t => t.id); const toks = tokens.map(t => t.token); const counts = tokens.map(t => t.count); @@ -261,13 +423,10 @@ def build_html( }} }}]; - const layout = {{ + Plotly.newPlot("tableDiv", tableData, {{ margin: {{l: 10, r: 10, t: 10, b: 10}}, - }}; - - Plotly.newPlot("tableDiv", tableData, layout, {{displayModeBar: false}}); + }}, {{displayModeBar: false}}); - // Click-to-select token: for tables, plotly_click gives pointNumber (row index) const tableDiv = document.getElementById("tableDiv"); tableDiv.on("plotly_click", (ev) => {{ try {{ @@ -288,32 +447,52 @@ def build_html( const xs = DATA.files.slice(); const ys = xs.map(fn => (per[fn] || 0)); - const trace = {{ + Plotly.newPlot("barDiv", [{{ type: "bar", x: xs, y: ys - }}; - - const layout = {{ + }}], {{ margin: {{l: 50, r: 10, t: 30, b: 120}}, - xaxis: {{ - tickangle: 35, - automargin: true - }}, - yaxis: {{ - title: "Count" - }}, + xaxis: {{ tickangle: 35, automargin: true }}, + yaxis: {{ title: "Count" }}, title: `Token: ${{name}} (id=${{tokenId}})` - }}; + }}, {{displayModeBar: true}}); +}} + +function renderHeatmap() {{ + const h = DATA.heatmap; + const div = document.getElementById("heatDiv"); + + if (!h) {{ + Plotly.newPlot(div, [], {{ + margin: {{l: 20, r: 10, t: 30, b: 30}}, + title: "Similarity heatmap: not computed", + annotations: [{{ + text: "No heatmap payload present.", + xref: "paper", yref: "paper", x: 0.5, y: 0.5, showarrow: false + }}] + }}, {{displayModeBar: false}}); + return; + }} + + if (!h.ok) {{ + Plotly.newPlot(div, [], {{ + margin: {{l: 20, r: 10, t: 30, b: 30}}, + title: "Similarity heatmap: unavailable", + annotations: [{{ + text: h.reason || "Unavailable", + xref: "paper", yref: "paper", x: 0.5, y: 0.5, showarrow: false + }}] + }}, {{displayModeBar: false}}); + return; + }} - Plotly.newPlot("barDiv", [trace], layout, {{displayModeBar: true}}); - document.getElementById("rightTitle").textContent = "Per-file counts"; + Plotly.newPlot(div, h.traces, h.layout, {{displayModeBar: true}}); }} function selectToken(tokenId, updateSelect) {{ if (updateSelect) {{ - const sel = document.getElementById("tokenSelect"); - sel.value = String(tokenId); + document.getElementById("tokenSelect").value = String(tokenId); }} renderBar(tokenId); }} @@ -321,11 +500,12 @@ def build_html( function init() {{ buildSelectOptions(DATA.tokens); - // Default selection const sel = document.getElementById("tokenSelect"); sel.value = String(DATA.default_token_id); + renderTable(DATA.tokens); renderBar(DATA.default_token_id); + renderHeatmap(); sel.addEventListener("change", (e) => {{ selectToken(e.target.value, false); @@ -336,19 +516,14 @@ def build_html( const q = e.target.value || ""; const filtered = filterTokens(DATA.tokens, q); - // update dropdown to filtered list, but keep current selection if still present const cur = document.getElementById("tokenSelect").value; buildSelectOptions(filtered); - // if current selection is in filtered list, keep it; else select first const hasCur = filtered.some(t => String(t.id) === String(cur)); const newId = hasCur ? cur : (filtered.length ? 
String(filtered[0].id) : String(DATA.default_token_id)); document.getElementById("tokenSelect").value = newId; - // rerender table with filtered tokens renderTable(filtered); - - // update bar based on dropdown selection selectToken(newId, false); }}); }} @@ -373,12 +548,22 @@ def main() -> None: help="Recurse into subdirectories (default: false).") ap.add_argument("--suffixes", default=".txt", help="Comma-separated suffixes to include (default: .txt). Example: .txt,.md") + ap.add_argument("--top-k", type=int, default=1500, - help="Embed only top-K tokens by total frequency into the HTML for interactivity (default: 1500).") - ap.add_argument("--out", default="vocab_freq_dashboard.html", - help="Output HTML path (default: vocab_freq_dashboard.html)") + help="Embed only top-K tokens by total frequency into the HTML (default: 1500).") ap.add_argument("--min-count", type=int, default=1, help="Only consider tokens with total count >= this (default: 1).") + ap.add_argument("--out", default="vocab_freq_dashboard.html", + help="Output HTML path (default: vocab_freq_dashboard.html)") + + # NEW: heatmap controls (no dendro mode; just heatmap) + ap.add_argument("--heatmap", action="store_true", + help="Compute and embed a file similarity heatmap (requires numpy).") + ap.add_argument("--heatmap-top-k", type=int, default=300, + help="Use top-K frequent tokens (from the directory) as features for TF-IDF similarity (default: 300).") + ap.add_argument("--heatmap-reorder", action="store_true", + help="Reorder files to group similar ones (simple greedy heuristic, no SciPy).") + args = ap.parse_args() vocab_path = Path(args.vocab) @@ -397,10 +582,9 @@ def main() -> None: raise SystemExit(f"No files found in {root} with suffixes={suffixes} (try --recursive or --suffixes)") sp = spm.SentencePieceProcessor(model_file=str(model_path)) - vocab_size = sp.get_piece_size() print(f"[info] model: {model_path}") - print(f"[info] vocab size: {vocab_size}") + print(f"[info] vocab size: {sp.get_piece_size()}") print(f"[info] scanning {len(files)} files under: {root}") total = Counter() @@ -414,24 +598,21 @@ def main() -> None: per_file[rel] = c total.update(c) - # Build top tokens (by total frequency) + # Token UI: top tokens by directory frequency items = [(tid, cnt) for tid, cnt in total.items() if cnt >= args.min_count] items.sort(key=lambda x: x[1], reverse=True) - if not items: raise SystemExit(f"No tokens met min-count={args.min_count} (unexpected).") top_items = items[: max(1, args.top_k)] token_rows: List[Dict] = [] - # per_file_counts: token_id(str) -> file -> count (only for embedded tokens) per_file_counts: Dict[str, Dict[str, int]] = {} for tid, cnt in top_items: tok = human_token(sp.id_to_piece(int(tid))) token_rows.append({"id": int(tid), "token": tok, "count": int(cnt)}) - # Build per-file map for embedded tokens for tid, _ in top_items: tid = int(tid) k = str(tid) @@ -443,6 +624,22 @@ def main() -> None: default_token_id = int(top_items[0][0]) + heatmap_payload: Optional[Dict] = None + if args.heatmap: + # Features for similarity + feat_tok_ids = [int(tid) for tid, _ in items[: max(2, args.heatmap_top_k)]] + print(f"[info] heatmap features: top {len(feat_tok_ids)} tokens (TF-IDF)") + try: + heatmap_payload = _build_heatmap_payload( + file_names=file_names, + per_file=per_file, + token_ids_for_heatmap=feat_tok_ids, + reorder=args.heatmap_reorder, + ) + except Exception as e: + heatmap_payload = {"ok": False, "reason": f"Failed to build heatmap: {e!r}"} + print(f"[warn] heatmap failed: {e!r}") + title = 
f"SentencePiece token frequency dashboard ({root.name})" build_html( title=title, @@ -450,11 +647,14 @@ def main() -> None: per_file_counts=per_file_counts, file_order=file_names, default_token_id=default_token_id, + heatmap_payload=heatmap_payload, out_path=out, ) print(f"[done] wrote: {out}") - print(f"[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + print("[note] Uses Plotly CDN for interactivity; open the HTML in your browser.") + if args.heatmap and (np is None): + print("[note] Install heatmap deps: python3 -m pip install numpy") if __name__ == "__main__":