From 8a5933f9dde8d2c4468d617ddd29dd2f3cb4354e Mon Sep 17 00:00:00 2001
From: Himansh raj <iamthehimansh@gmail.com>
Date: Tue, 9 Dec 2025 18:32:41 +0530
Subject: [PATCH 1/2] feat: Add high-level Python API with automatic voice
 loading

Add easy-to-use Python inference API with one-line synthesis, automatic
default voice loading, and comprehensive documentation.

Key features:
- synthesize_speech() one-line function
- Automatic default voice loading (7 voices included)
- Iterator support for LLM integration
- Complete documentation and examples
---
 PYTHON_API.md                | 230 +++++++++++++
 docs/python_inference.md     | 604 +++++++++++++++++++++++++++++++++++
 examples/simple_inference.py |  66 ++++
 vibevoice/__init__.py        |  67 ++++
 vibevoice/inference.py       | 583 +++++++++++++++++++++++++++++++++
 5 files changed, 1550 insertions(+)
 create mode 100644 PYTHON_API.md
 create mode 100644 docs/python_inference.md
 create mode 100644 examples/simple_inference.py
 create mode 100644 vibevoice/inference.py

diff --git a/PYTHON_API.md b/PYTHON_API.md
new file mode 100644
index 00000000..e02f2cf0
--- /dev/null
+++ b/PYTHON_API.md
@@ -0,0 +1,230 @@
+# VibeVoice Python API
+
+Easy-to-use Python API for real-time text-to-speech with VibeVoice.
+
+## Quick Start
+
+### One-Line Synthesis (Easiest!)
+
+```python
+from vibevoice import synthesize_speech
+
+# Simplest possible - automatically uses default voice!
+synthesize_speech("Hello world!", device="cuda")
+```
+
+### Class-Based API
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+# Initialize TTS (automatically loads default voice)
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"
+)
+
+# Initialize audio player
+player = AudioPlayer()
+
+# Generate text
+def text_gen():
+    for word in ["Hello", "world"]:
+        yield word
+
+# Generate and play
+audio = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio, realtime=True)
+```
+
+## Installation
+
+```bash
+# Install VibeVoice
+pip install -e .
+
+# Install audio playback support
+pip install sounddevice
+```
+
+## Features
+
+- ✅ **One-line synthesis** - `synthesize_speech("Hello!")`
+- ✅ **Automatic voice loading** - 7 default voices included, no setup needed!
+- ✅ **Real-time streaming** - ~100ms latency
+- ✅ **Voice cloning** - Use voice prompts for speaker cloning
+- ✅ **Speaker selection** - Choose output device
+- ✅ **Easy-to-use API** - Simple high-level interface
+- ✅ **GPU acceleration** - CUDA, Apple Silicon (MPS), CPU support
+- ✅ **Iterator-based** - Works with LLM token streams
+
+## Documentation
+
+- **[Python Inference Guide](docs/python_inference.md)** - Complete API reference
+- **[Examples](examples/)** - Code examples
+
+## API Overview
+
+### High-Level Functions
+
+```python
+from vibevoice import synthesize_speech, list_default_voices
+
+# One-line synthesis (easiest!)
+synthesize_speech("Hello world!", device="cuda")
+
+# List available default voices
+voices = list_default_voices()
+# ['en-Mike_man', 'en-Emma_woman', 'en-Carter_man', ...]
+
+# With iterator (LLM streaming)
+def text_gen():
+    for word in ["Hello", "world"]:
+        yield word
+synthesize_speech(text_gen(), device="cuda")
+```
+
+### Class-Based API
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+# TTS with streaming (automatically loads default voice)
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path="path/to/voice.pt",  # Optional - uses default if None
+    device="cuda",
+    inference_steps=5
+)
+
+# Audio player with device selection
+player = AudioPlayer(device_id=None)  # None = default device
+player.play_stream(audio_iterator, realtime=True)
+```
+
+### Low-Level API
+
+```python
+from vibevoice import (
+    VibeVoiceStreamingForConditionalGenerationInference,
+    VibeVoiceStreamingProcessor,
+    AudioStreamer
+)
+
+# Direct model access for advanced users
+processor = VibeVoiceStreamingProcessor.from_pretrained(model_path)
+model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)
+```
+
+## Examples
+
+### One-Line Synthesis
+
+```python
+from vibevoice import synthesize_speech
+
+# Simplest possible
+synthesize_speech("Hello world!", device="cuda")
+
+# With iterator
+def text_gen():
+    for word in ["Hello", "world"]:
+        yield word
+synthesize_speech(text_gen(), device="cuda")
+```
+
+### Basic TTS with Classes
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
+player = AudioPlayer()
+
+def text_gen():
+    yield "Hello world!"
+
+audio = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio, realtime=True)
+```
+
+### Save to File
+
+```python
+import numpy as np
+from vibevoice import VibeVoiceStreamingTTS
+
+tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
+
+chunks = list(tts.text_to_speech_streaming(text_gen()))
+full_audio = np.concatenate(chunks)
+tts.save_audio(full_audio, "output.wav")
+```
+
+### Voice Cloning
+
+```python
+from vibevoice import VibeVoiceStreamingTTS
+
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path="voices/speaker.pt",  # Speaker embedding
+    device="cuda"
+)
+
+audio = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio, realtime=True)
+```
+
+### LLM Integration
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
+player = AudioPlayer()
+
+def llm_stream():
+    """Your LLM generates tokens here"""
+    for token in llm.generate():
+        yield token
+
+# Real-time TTS as LLM generates
+audio = tts.text_to_speech_streaming(llm_stream())
+player.play_stream(audio, realtime=True)
+```
+
+## Performance
+
+- **First audio chunk**: ~100-300ms (CUDA)
+- **Audio quality**: 24kHz sample rate
+- **Devices**: CUDA (fastest), MPS (Apple), CPU (slower)
+- **Inference steps**: 5 (fast) to 50 (high quality)
+
+## Requirements
+
+- Python >= 3.9
+- PyTorch >= 2.0
+- sounddevice (for audio playback)
+- CUDA toolkit (optional, for GPU)
+
+## License
+
+See [LICENSE](LICENSE) for details.
+
+## Citation
+
+If you use VibeVoice in your research, please cite:
+
+```bibtex
+@article{vibevoice2025,
+  title={VibeVoice: Real-time Streaming Text-to-Speech},
+  author={VibeVoice Team},
+  journal={Microsoft Research},
+  year={2025}
+}
+```
diff --git a/docs/python_inference.md b/docs/python_inference.md
new file mode 100644
index 00000000..ce8b1ea5
--- /dev/null
+++ b/docs/python_inference.md
@@ -0,0 +1,604 @@
+# VibeVoice Python Inference Guide
+
+Complete guide for using VibeVoice text-to-speech in Python with streaming support.
+
+## Table of Contents
+
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [API Reference](#api-reference)
+  - [synthesize_speech()](#synthesize_speech) - High-level function (easiest!)
+  - [list_default_voices()](#list_default_voices) - List available voices
+  - [VibeVoiceStreamingTTS](#vibevoicestreamingtts)
+  - [AudioPlayer](#audioplayer)
+- [Advanced Usage](#advanced-usage)
+- [Examples](#examples)
+
+---
+
+## Installation
+
+```bash
+# Install VibeVoice
+pip install -e /path/to/VibeVoice
+
+# Install audio playback support (optional, required for AudioPlayer)
+pip install sounddevice
+```
+
+---
+
+## Quick Start
+
+### One-Line Synthesis (Easiest!)
+
+```python
+from vibevoice import synthesize_speech
+
+# Simplest possible - automatically uses default voice!
+synthesize_speech("Hello world!", device="cuda")
+```
+
+### Basic Text-to-Speech
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+# Initialize TTS (automatically loads default voice)
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"  # or "cpu" or "mps"
+)
+
+# Initialize audio player
+player = AudioPlayer()
+
+# Generate text
+def text_generator():
+    for word in ["Hello", "world", "from", "VibeVoice"]:
+        yield word
+
+# Generate and play audio in real-time
+audio_stream = tts.text_to_speech_streaming(text_generator())
+player.play_stream(audio_stream, realtime=True)
+```
+
+### With Voice Cloning
+
+```python
+# Load with voice prompt for voice cloning
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path="path/to/voice.pt",  # Speaker embedding
+    device="cuda"
+)
+
+# Generate with cloned voice
+audio_stream = tts.text_to_speech_streaming(text_generator())
+player.play_stream(audio_stream, realtime=True)
+```
+
+---
+
+## API Reference
+
+### synthesize_speech()
+
+**Easiest way to use VibeVoice!** One-line function for text-to-speech synthesis.
+
+```python
+synthesize_speech(
+    text: str | Iterator[str],
+    model_path: str = "microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path: Optional[str] = None,
+    device: str = "cuda",
+    output_file: Optional[str] = None,
+    play_audio: bool = True,
+    speaker_device_id: Optional[int] = None,
+    inference_steps: int = 5,
+    cfg_scale: float = 1.5,
+    realtime: bool = True
+) -> Optional[np.ndarray]
+```
+
+**Parameters:**
+
+- `text` (str or Iterator[str]): Text to synthesize or iterator yielding text chunks
+- `model_path` (str): HuggingFace model ID (default: "microsoft/VibeVoice-Realtime-0.5B")
+- `voice_prompt_path` (str, optional): Custom voice prompt path. If None, uses default voice.
+- `device` (str): Device ("cuda", "mps", "cpu")
+- `output_file` (str, optional): Path to save WAV file
+- `play_audio` (bool): Whether to play audio (default: True)
+- `speaker_device_id` (int, optional): Speaker device ID (None for default)
+- `inference_steps` (int): Diffusion steps (5=fast, 50=quality)
+- `cfg_scale` (float): Guidance scale (1.0-2.0)
+- `realtime` (bool): Use streaming playback (default: True)
+
+**Returns:**
+- `np.ndarray` or `None`: Audio array if `output_file` specified, else None
+
+**Examples:**
+
+```python
+from vibevoice import synthesize_speech
+
+# Simple usage
+synthesize_speech("Hello world!", device="cuda")
+
+# With iterator (LLM streaming)
+def text_gen():
+    for word in ["Hello", "streaming", "world"]:
+        yield word
+synthesize_speech(text_gen(), device="cuda")
+
+# Save to file
+synthesize_speech("Save this", output_file="output.wav", device="cuda")
+
+# Custom voice
+synthesize_speech(
+    "Custom voice",
+    voice_prompt_path="voices/custom.pt",
+    device="cuda"
+)
+
+# High quality
+synthesize_speech(
+    "High quality",
+    inference_steps=50,
+    cfg_scale=2.0,
+    device="cuda"
+)
+```
+
+---
+
+### list_default_voices()
+
+List available default voice prompts included with VibeVoice.
+
+```python
+list_default_voices() -> list[str]
+```
+
+**Returns:**
+- `list[str]`: List of available voice names (without .pt extension)
+
+**Example:**
+
+```python
+from vibevoice import list_default_voices
+
+voices = list_default_voices()
+print(voices)
+# ['en-Carter_man', 'en-Davis_man', 'en-Emma_woman', 'en-Frank_man',
+#  'en-Grace_woman', 'en-Mike_man', 'in-Samuel_man']
+
+# Use a specific default voice
+voice_path = f"demo/voices/streaming_model/{voices[2]}.pt"  # en-Emma_woman
+synthesize_speech("Hello", voice_prompt_path=voice_path)
+```
+
+---
+
+### VibeVoiceStreamingTTS
+
+High-level wrapper for VibeVoice streaming text-to-speech.
+
+#### Constructor
+
+```python
+VibeVoiceStreamingTTS(
+    model_path: str,
+    voice_prompt_path: Optional[str] = None,
+    device: str = "cuda",
+    inference_steps: int = 5
+)
+```
+
+**Parameters:**
+
+- `model_path` (str): Path to VibeVoice model or HuggingFace model ID
+  - Example: `"microsoft/VibeVoice-Realtime-0.5B"`
+- `voice_prompt_path` (str, optional): Path to voice prompt file (.pt) for voice cloning
+  - **If None (default):** Automatically uses a default voice from `demo/voices/streaming_model/`
+  - **7 default voices available:** en-Mike_man, en-Emma_woman, en-Carter_man, en-Davis_man, en-Frank_man, en-Grace_woman, in-Samuel_man
+  - Use `list_default_voices()` to see available voices
+- `device` (str): Device to run on
+  - `"cuda"` - NVIDIA GPU (fastest, requires flash-attention-2)
+  - `"mps"` - Apple Silicon GPU
+  - `"cpu"` - CPU (slower)
+- `inference_steps` (int): Number of diffusion steps
+  - `5` - Fast, good quality (default, ~100ms latency)
+  - `50` - High quality (~500ms latency)
+
+#### Methods
+
+##### `text_to_speech_streaming(text_iterator, cfg_scale=1.5)`
+
+Generate speech from text iterator with real-time streaming.
+
+**Parameters:**
+
+- `text_iterator` (Iterator[str]): Iterator yielding text tokens/chunks
+- `cfg_scale` (float): Classifier-free guidance scale (1.0-2.0)
+  - `1.0` - Faster, lower quality
+  - `1.5` - Balanced (default)
+  - `2.0` - Better quality, slower
+
+**Returns:**
+
+- Iterator[np.ndarray]: Audio chunks as float32 numpy arrays
+
+**Example:**
+
+```python
+def text_gen():
+    for word in ["Hello", "world"]:
+        yield word
+
+for audio_chunk in tts.text_to_speech_streaming(text_gen()):
+    # audio_chunk is np.ndarray, shape (N,), dtype float32
+    # values are normalized to [-1.0, 1.0]
+    print(f"Received {len(audio_chunk)} samples")
+```
+
+##### `save_audio(audio, output_path)`
+
+Save generated audio to WAV file.
+
+**Parameters:**
+
+- `audio` (np.ndarray): Audio data
+- `output_path` (str): Path to save WAV file
+
+**Example:**
+
+```python
+import numpy as np
+
+# Collect all chunks
+chunks = list(tts.text_to_speech_streaming(text_gen()))
+
+# Concatenate and save
+full_audio = np.concatenate(chunks)
+tts.save_audio(full_audio, "output.wav")
+```
+
+---
+
+### AudioPlayer
+
+Audio player with speaker selection and streaming support.
+
+#### Constructor
+
+```python
+AudioPlayer(device_id: Optional[int] = None, sample_rate: int = 24000)
+```
+
+**Parameters:**
+
+- `device_id` (int, optional): Speaker device ID (None for default)
+- `sample_rate` (int): Audio sample rate in Hz (default 24000)
+
+#### Methods
+
+##### `list_devices(show_all=False)` [static]
+
+List available audio output devices.
+
+**Parameters:**
+
+- `show_all` (bool): If True, show all devices including duplicates
+
+**Example:**
+
+```python
+AudioPlayer.list_devices()
+# Output:
+# Available Audio Output Devices:
+# [3] Microsoft Sound Mapper - Output ⭐ DEFAULT
+# [4] Speakers (USB Audio Device)
+```
+
+##### `get_default_output_device()` [static]
+
+Get default output device ID.
+
+**Returns:**
+
+- int: Default device ID
+
+**Example:**
+
+```python
+device_id = AudioPlayer.get_default_output_device()
+player = AudioPlayer(device_id=device_id)
+```
+
+##### `play_stream(audio_iterator, realtime=True)`
+
+Play audio from an iterator of chunks.
+
+**Parameters:**
+
+- `audio_iterator` (Iterator[np.ndarray]): Iterator yielding audio chunks
+- `realtime` (bool): Streaming mode
+  - `True` - Real-time streaming with ~100ms latency
+  - `False` - Buffered playback (waits for all chunks, smooth)
+
+**Example:**
+
+```python
+# Real-time streaming (low latency)
+audio_stream = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio_stream, realtime=True)
+
+# Buffered playback (smooth, no gaps)
+audio_stream = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio_stream, realtime=False)
+```
+
+##### `stop()`
+
+Stop current playback.
+
+---
+
+## Advanced Usage
+
+### Select Specific Speaker
+
+```python
+# List available devices
+AudioPlayer.list_devices()
+
+# Use specific device
+player = AudioPlayer(device_id=4)  # Use device 4
+audio_stream = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio_stream, realtime=True)
+```
+
+### Custom Quality Settings
+
+```python
+# High quality (slower)
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda",
+    inference_steps=50  # More steps = better quality
+)
+
+audio_stream = tts.text_to_speech_streaming(
+    text_gen(),
+    cfg_scale=2.0  # Higher CFG = better quality
+)
+player.play_stream(audio_stream, realtime=True)
+```
+
+### Process Audio Chunks
+
+```python
+import soundfile as sf
+
+audio_chunks = []
+for audio_chunk in tts.text_to_speech_streaming(text_gen()):
+    # Process each chunk as it arrives
+    audio_chunks.append(audio_chunk)
+
+    # You can also apply effects here
+    # audio_chunk = apply_effects(audio_chunk)
+
+# Save all chunks
+full_audio = np.concatenate(audio_chunks)
+sf.write("output.wav", full_audio, tts.sample_rate)
+```
+
+### Streaming from LLM Output
+
+```python
+def llm_token_stream():
+    """Simulate LLM generating tokens"""
+    llm_output = [
+        "The", "weather", "today", "is", "sunny", "and", "warm."
+    ]
+    for token in llm_output:
+        yield token
+
+# Convert LLM output to speech in real-time
+audio_stream = tts.text_to_speech_streaming(llm_token_stream())
+player.play_stream(audio_stream, realtime=True)
+```
+
+---
+
+## Examples
+
+### Example 1: Simple TTS
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"
+)
+
+player = AudioPlayer()
+
+def text_gen():
+    yield "Hello world!"
+
+audio = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio, realtime=True)
+```
+
+### Example 2: Voice Cloning
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+# Load with voice prompt
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path="voices/emma.pt",  # Clone Emma's voice
+    device="cuda"
+)
+
+player = AudioPlayer()
+
+def text_gen():
+    yield "This is a cloned voice speaking!"
+
+audio = tts.text_to_speech_streaming(text_gen())
+player.play_stream(audio, realtime=True)
+```
+
+### Example 3: Save to File
+
+```python
+import numpy as np
+from vibevoice import VibeVoiceStreamingTTS
+
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"
+)
+
+def text_gen():
+    for sentence in ["Hello.", "How are you?", "Goodbye!"]:
+        yield sentence
+
+# Collect all chunks
+chunks = list(tts.text_to_speech_streaming(text_gen()))
+full_audio = np.concatenate(chunks)
+
+# Save to file
+tts.save_audio(full_audio, "output.wav")
+print("Audio saved to output.wav")
+```
+
+### Example 4: Multiple Speaker Devices
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+# List available devices
+AudioPlayer.list_devices()
+
+# Use multiple devices
+player1 = AudioPlayer(device_id=3)  # Default speaker
+player2 = AudioPlayer(device_id=4)  # External speaker
+
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"
+)
+
+def text_gen():
+    yield "Hello from VibeVoice!"
+
+# Play on device 1
+audio1 = tts.text_to_speech_streaming(text_gen())
+player1.play_stream(audio1, realtime=True)
+
+# Play on device 2
+audio2 = tts.text_to_speech_streaming(text_gen())
+player2.play_stream(audio2, realtime=True)
+```
+
+### Example 5: Real-time LLM Integration
+
+```python
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+import threading
+
+tts = VibeVoiceStreamingTTS(
+    model_path="microsoft/VibeVoice-Realtime-0.5B",
+    device="cuda"
+)
+
+player = AudioPlayer()
+
+def llm_stream():
+    """Your LLM generates tokens here"""
+    tokens = [
+        "Once", "upon", "a", "time", "there", "was",
+        "a", "voice", "assistant", "that", "could", "speak."
+    ]
+    for token in tokens:
+        yield token
+
+# Generate and play simultaneously
+audio_stream = tts.text_to_speech_streaming(llm_stream())
+player.play_stream(audio_stream, realtime=True)
+```
+
+---
+
+## Performance Tips
+
+1. **Use CUDA** - GPU is much faster than CPU
+   ```python
+   tts = VibeVoiceStreamingTTS(model_path="...", device="cuda")
+   ```
+
+2. **Lower inference steps** for lower latency
+   ```python
+   tts = VibeVoiceStreamingTTS(model_path="...", inference_steps=5)
+   ```
+
+3. **Use real-time streaming** for lowest latency
+   ```python
+   player.play_stream(audio, realtime=True)
+   ```
+
+4. **Prebuffer for smoother playback**
+   - Real-time mode prebuffers 100ms automatically
+   - Buffered mode collects all audio first
+
+---
+
+## Troubleshooting
+
+### No audio output
+
+```python
+# Check available devices
+AudioPlayer.list_devices()
+
+# Try default device
+player = AudioPlayer(device_id=None)
+```
+
+### CUDA out of memory
+
+```python
+# Use CPU instead
+tts = VibeVoiceStreamingTTS(model_path="...", device="cpu")
+```
+
+### Import errors
+
+```bash
+# Reinstall VibeVoice
+pip install -e /path/to/VibeVoice
+
+# Install sounddevice for audio playback
+pip install sounddevice
+```
+
+### Distorted audio
+
+Audio is automatically normalized and clipped to [-1.0, 1.0] range. If you still hear distortion, try:
+
+```python
+# Lower CFG scale
+audio = tts.text_to_speech_streaming(text_gen(), cfg_scale=1.0)
+```
+
+---
+
+## License
+
+See the [LICENSE](../LICENSE) file for details.
diff --git a/examples/simple_inference.py b/examples/simple_inference.py
new file mode 100644
index 00000000..a3354cf7
--- /dev/null
+++ b/examples/simple_inference.py
@@ -0,0 +1,66 @@
+"""
+Simple VibeVoice Inference Example
+
+This script demonstrates basic usage of the VibeVoice Python API.
+
+Run from VibeVoice root:
+    python examples/simple_inference.py
+"""
+
+from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+
+
+def main():
+    print("="*60)
+    print("VibeVoice Simple Inference Example")
+    print("="*60)
+    print()
+
+    # Configuration
+    MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
+    VOICE_PROMPT_PATH = "demo/voices/streaming_model/en-Emma_woman.pt"  # Optional
+    DEVICE = "cuda"  # or "cpu" or "mps"
+
+    # Initialize TTS
+    print("Initializing VibeVoice...")
+    tts = VibeVoiceStreamingTTS(
+        model_path=MODEL_PATH,
+        voice_prompt_path=VOICE_PROMPT_PATH,
+        device=DEVICE,
+        inference_steps=5  # Fast inference
+    )
+    print()
+
+    # Initialize audio player
+    print("Initializing audio player...")
+    player = AudioPlayer()
+    print()
+
+    # List available devices
+    print("Available audio devices:")
+    AudioPlayer.list_devices()
+    print()
+
+    # Generate text
+    def text_generator():
+        """Simple text generator"""
+        text = "Hello! This is VibeVoice speaking. I can generate speech in real time."
+        for word in text.split():
+            yield word
+
+    # Generate and play
+    print("Generating and playing speech...")
+    print("Text: 'Hello! This is VibeVoice speaking. I can generate speech in real time.'")
+    print()
+
+    audio_stream = tts.text_to_speech_streaming(text_generator())
+    player.play_stream(audio_stream, realtime=True)
+
+    print()
+    print("="*60)
+    print("Done!")
+    print("="*60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vibevoice/__init__.py b/vibevoice/__init__.py
index e69de29b..258c5779 100644
--- a/vibevoice/__init__.py
+++ b/vibevoice/__init__.py
@@ -0,0 +1,67 @@
+"""
+VibeVoice - Real-time Streaming Text-to-Speech
+
+A high-quality, low-latency text-to-speech system with streaming support.
+
+High-Level API (Recommended):
+    - VibeVoiceStreamingTTS: Easy-to-use TTS with streaming
+    - AudioPlayer: Audio playback with speaker selection
+
+Low-Level API (Advanced):
+    - VibeVoiceStreamingForConditionalGenerationInference: Core TTS model
+    - VibeVoiceStreamingProcessor: Text and audio processor
+    - AudioStreamer: Low-level audio streaming
+
+Quick Start:
+    >>> from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
+    >>>
+    >>> # Initialize TTS
+    >>> tts = VibeVoiceStreamingTTS(
+    ...     model_path="microsoft/VibeVoice-Realtime-0.5B",
+    ...     device="cuda"
+    ... )
+    >>>
+    >>> # Generate and play
+    >>> player = AudioPlayer()
+    >>> def text_gen():
+    ...     for word in ["Hello", "world"]:
+    ...         yield word
+    >>>
+    >>> audio_stream = tts.text_to_speech_streaming(text_gen())
+    >>> player.play_stream(audio_stream, realtime=True)
+"""
+
+# High-level API
+from .inference import (
+    VibeVoiceStreamingTTS,
+    AudioPlayer,
+    synthesize_speech,
+    list_default_voices
+)
+
+# Low-level API
+from .modular.modeling_vibevoice_streaming_inference import (
+    VibeVoiceStreamingForConditionalGenerationInference
+)
+from .processor.vibevoice_streaming_processor import (
+    VibeVoiceStreamingProcessor
+)
+from .modular.streamer import (
+    AudioStreamer,
+    AsyncAudioStreamer
+)
+
+__all__ = [
+    # High-level API
+    'VibeVoiceStreamingTTS',
+    'AudioPlayer',
+    'synthesize_speech',
+    'list_default_voices',
+    # Low-level API
+    'VibeVoiceStreamingForConditionalGenerationInference',
+    'VibeVoiceStreamingProcessor',
+    'AudioStreamer',
+    'AsyncAudioStreamer',
+]
+
+__version__ = '0.0.1'
diff --git a/vibevoice/inference.py b/vibevoice/inference.py
new file mode 100644
index 00000000..c09b7eb2
--- /dev/null
+++ b/vibevoice/inference.py
@@ -0,0 +1,583 @@
+"""
+VibeVoice Python Inference Module
+
+High-level API for easy text-to-speech inference with streaming support.
+"""
+
+import copy
+from pathlib import Path
+from typing import Iterator, Generator, Optional
+from threading import Thread, Lock
+
+import torch
+import numpy as np
+
+try:
+    import sounddevice as sd
+    SOUNDDEVICE_AVAILABLE = True
+except ImportError:
+    SOUNDDEVICE_AVAILABLE = False
+    print("Warning: sounddevice not installed. Audio playback features will be disabled.")
+    print("Install with: pip install sounddevice")
+
+from .modular.streamer import AudioStreamer
+from .modular.modeling_vibevoice_streaming_inference import (
+    VibeVoiceStreamingForConditionalGenerationInference
+)
+from .processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
+
+
+class VibeVoiceStreamingTTS:
+    """
+    High-level wrapper for VibeVoice streaming text-to-speech.
+
+    This class provides an easy-to-use interface for real-time TTS generation
+    with support for voice cloning and streaming output.
+
+    Example:
+        >>> tts = VibeVoiceStreamingTTS(
+        ...     model_path="microsoft/VibeVoice-Realtime-0.5B",
+        ...     voice_prompt_path="path/to/voice.pt",
+        ...     device="cuda"
+        ... )
+        >>>
+        >>> def text_gen():
+        ...     for word in ["Hello", "world"]:
+        ...         yield word
+        >>>
+        >>> for audio_chunk in tts.text_to_speech_streaming(text_gen()):
+        ...     # Process audio chunk
+        ...     pass
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        voice_prompt_path: Optional[str] = None,
+        device: str = "cuda",
+        inference_steps: int = 5,
+    ):
+        """
+        Initialize VibeVoice streaming TTS.
+
+        Args:
+            model_path: Path to VibeVoice model or HuggingFace model ID
+            voice_prompt_path: Optional path to voice prompt (.pt file) for voice cloning.
+                              If None, will automatically use a default voice from demo/voices/streaming_model/
+            device: Device to run on ('cuda', 'mps', or 'cpu')
+            inference_steps: Number of diffusion inference steps (lower = faster, higher = better quality)
+        """
+        print(f"Loading VibeVoice model from {model_path}...")
+
+        # Load processor
+        self.processor = VibeVoiceStreamingProcessor.from_pretrained(model_path)
+
+        # Determine dtype and attention implementation
+        if device == "cuda":
+            dtype = torch.bfloat16
+            attn_impl = "flash_attention_2"
+            device_map = "cuda"
+        elif device == "mps":
+            dtype = torch.float32
+            attn_impl = "sdpa"
+            device_map = None
+        else:
+            dtype = torch.float32
+            attn_impl = "sdpa"
+            device_map = "cpu"
+
+        # Load model
+        self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+            model_path,
+            torch_dtype=dtype,
+            device_map=device_map,
+            attn_implementation=attn_impl
+        )
+
+        if device == "mps":
+            self.model.to("mps")
+
+        self.model.eval()
+        self.model.set_ddpm_inference_steps(num_steps=inference_steps)
+
+        # Load voice prompt
+        self.voice_prompt = None
+        if voice_prompt_path and Path(voice_prompt_path).exists():
+            print(f"Loading voice prompt from {voice_prompt_path}")
+            self.voice_prompt = torch.load(
+                voice_prompt_path,
+                map_location=device,
+                weights_only=False
+            )
+        else:
+            # Try to find default voice prompts
+            default_voice_dir = Path(__file__).parent.parent / "demo" / "voices" / "streaming_model"
+            if default_voice_dir.exists():
+                # Look for a default voice (prefer en-Mike_man.pt or first available)
+                default_voices = list(default_voice_dir.glob("*.pt"))
+                if default_voices:
+                    # Prefer en-Mike_man.pt if available
+                    preferred = default_voice_dir / "en-Mike_man.pt"
+                    voice_path = preferred if preferred.exists() else default_voices[0]
+                    print(f"Loading default voice prompt from {voice_path.name}")
+                    self.voice_prompt = torch.load(
+                        voice_path,
+                        map_location=device,
+                        weights_only=False
+                    )
+
+        if self.voice_prompt is None:
+            raise RuntimeError(
+                "No voice prompt provided and no default voices found. "
+                "Please provide a voice_prompt_path or ensure demo/voices/streaming_model/*.pt exists."
+            )
+
+        self.device = device
+        self.sample_rate = 24000
+        print("Model loaded successfully!")
+
+    def text_to_speech_streaming(
+        self,
+        text_iterator: Iterator[str],
+        cfg_scale: float = 1.5,
+    ) -> Iterator[np.ndarray]:
+        """
+        Convert text from an iterator to speech chunks in real-time.
+
+        Args:
+            text_iterator: Iterator/generator that yields text tokens/chunks
+            cfg_scale: Classifier-free guidance scale (1.0-2.0, higher = better quality)
+
+        Yields:
+            numpy arrays containing audio chunks (float32, 1D, normalized to [-1.0, 1.0])
+
+        Example:
+            >>> def text_gen():
+            ...     for word in ["Hello", "world"]:
+            ...         yield word
+            >>>
+            >>> for audio_chunk in tts.text_to_speech_streaming(text_gen()):
+            ...     print(f"Received chunk with {len(audio_chunk)} samples")
+        """
+        # Collect text from iterator
+        text_chunks = list(text_iterator)
+        full_text = " ".join(text_chunks)
+
+        if not full_text.strip():
+            return
+
+        print(f"Generating speech for: '{full_text}'")
+
+        # Create audio streamer
+        audio_streamer = AudioStreamer(batch_size=1)
+
+        # Process input
+        inputs = self.processor.process_input_with_cached_prompt(
+            text=full_text,
+            cached_prompt=self.voice_prompt,
+            padding=True,
+            return_tensors="pt",
+        )
+
+        # Move inputs to device
+        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+                  for k, v in inputs.items()}
+
+        # Start generation in background thread for real-time streaming
+        def run_generation():
+            with torch.no_grad():
+                self.model.generate(
+                    **inputs,
+                    audio_streamer=audio_streamer,
+                    cfg_scale=cfg_scale,
+                    tokenizer=self.processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    all_prefilled_outputs=copy.deepcopy(self.voice_prompt),
+                )
+
+        generation_thread = Thread(target=run_generation, daemon=True)
+        generation_thread.start()
+
+        # Yield audio chunks as they arrive from the model
+        stream = audio_streamer.get_stream(0)
+        for audio_chunk in stream:
+            # Convert to numpy array (float32 for compatibility)
+            if torch.is_tensor(audio_chunk):
+                audio_chunk = audio_chunk.detach().cpu().to(torch.float32).numpy()
+            else:
+                audio_chunk = np.asarray(audio_chunk, dtype=np.float32)
+
+            # Reshape to 1D if needed
+            if audio_chunk.ndim > 1:
+                audio_chunk = audio_chunk.reshape(-1)
+
+            # Normalize if peak is above 1.0 to prevent distortion
+            peak = np.max(np.abs(audio_chunk)) if audio_chunk.size else 0.0
+            if peak > 1.0:
+                audio_chunk = audio_chunk / peak
+
+            # Clip to valid range [-1.0, 1.0]
+            audio_chunk = np.clip(audio_chunk, -1.0, 1.0)
+
+            yield audio_chunk.astype(np.float32, copy=False)
+
+        # Wait for generation to complete
+        generation_thread.join()
+
+    def save_audio(self, audio: np.ndarray, output_path: str):
+        """
+        Save generated audio to a WAV file.
+
+        Args:
+            audio: Audio data as numpy array
+            output_path: Path to save the WAV file
+
+        Example:
+            >>> chunks = list(tts.text_to_speech_streaming(text_gen()))
+            >>> full_audio = np.concatenate(chunks)
+            >>> tts.save_audio(full_audio, "output.wav")
+        """
+        self.processor.save_audio(
+            audio,
+            output_path=output_path,
+            sampling_rate=self.sample_rate
+        )
+
+
+class AudioPlayer:
+    """
+    Audio player with speaker selection support.
+
+    Provides easy playback of audio streams with automatic device management
+    and real-time streaming support.
+
+    Example:
+        >>> player = AudioPlayer()
+        >>> audio_stream = tts.text_to_speech_streaming(text_gen())
+        >>> player.play_stream(audio_stream, realtime=True)
+    """
+
+    def __init__(self, device_id: Optional[int] = None, sample_rate: int = 24000):
+        """
+        Initialize audio player.
+
+        Args:
+            device_id: Speaker device ID (None for default)
+            sample_rate: Audio sample rate in Hz (default 24000)
+        """
+        if not SOUNDDEVICE_AVAILABLE:
+            raise ImportError(
+                "sounddevice is required for audio playback. "
+                "Install with: pip install sounddevice"
+            )
+
+        self.device_id = device_id
+        self.sample_rate = sample_rate
+
+    @staticmethod
+    def list_devices(show_all: bool = False):
+        """
+        List available audio output devices.
+
+        Args:
+            show_all: If True, show all devices including duplicates.
+                     If False, show only unique output devices.
+
+        Example:
+            >>> AudioPlayer.list_devices()
+            Available Audio Output Devices:
+            [3] Microsoft Sound Mapper - Output ⭐ DEFAULT
+            [4] Speakers (USB Audio Device)
+        """
+        if not SOUNDDEVICE_AVAILABLE:
+            print("sounddevice not installed. Cannot list devices.")
+            return []
+
+        print("\nAvailable Audio Output Devices:")
+        print("-" * 60)
+        devices = sd.query_devices()
+        default_output = sd.default.device[1]
+
+        if show_all:
+            for i, device in enumerate(devices):
+                if device['max_output_channels'] > 0:
+                    default_marker = " (DEFAULT)" if i == default_output else ""
+                    print(f"[{i}] {device['name']}{default_marker}")
+                    print(f"    Channels: {device['max_output_channels']}")
+                    print(f"    Sample Rate: {device['default_samplerate']} Hz")
+                    print()
+        else:
+            seen_names = set()
+            output_devices = []
+
+            for i, device in enumerate(devices):
+                if device['max_output_channels'] > 0:
+                    name = device['name']
+                    if name not in seen_names:
+                        seen_names.add(name)
+                        is_default = (i == default_output)
+                        output_devices.append((i, name, is_default))
+
+            for i, name, is_default in output_devices:
+                default_marker = " ⭐ DEFAULT" if is_default else ""
+                print(f"[{i}] {name}{default_marker}")
+
+            print()
+            print(f"Default device ID: {default_output}")
+            print("Tip: Use device_id=None to use the default device")
+            print()
+
+        return devices
+
+    @staticmethod
+    def get_default_output_device():
+        """
+        Get the default output device ID.
+
+        Returns:
+            int: Default output device ID
+
+        Example:
+            >>> device_id = AudioPlayer.get_default_output_device()
+            >>> player = AudioPlayer(device_id=device_id)
+        """
+        if not SOUNDDEVICE_AVAILABLE:
+            return None
+        return sd.default.device[1]
+
+    def play_stream(self, audio_iterator: Iterator[np.ndarray], realtime: bool = True):
+        """
+        Play audio from an iterator of chunks.
+
+        Args:
+            audio_iterator: Iterator yielding audio chunks (numpy arrays)
+            realtime: If True, use streaming mode with minimal buffering (~100ms latency).
+                     If False, collect all chunks first for smooth playback.
+
+        Example:
+            >>> # Real-time streaming (low latency)
+            >>> player.play_stream(audio_stream, realtime=True)
+            >>>
+            >>> # Buffered playback (smooth, no gaps)
+            >>> player.play_stream(audio_stream, realtime=False)
+        """
+        if realtime:
+            # Real-time streaming with callback-based continuous playback
+            PREBUFFER_SECONDS = 0.1  # 100ms prebuffer
+            BLOCKSIZE = 2048          # ~85ms chunks at 24kHz
+
+            prebuffer_samples = int(self.sample_rate * PREBUFFER_SECONDS)
+
+            buffer = np.array([], dtype=np.float32)
+            buffer_lock = Lock()
+            iterator_finished = False
+            has_started = False
+
+            def fill_buffer():
+                nonlocal buffer, iterator_finished
+                for audio_chunk in audio_iterator:
+                    with buffer_lock:
+                        buffer = np.concatenate([buffer, audio_chunk])
+                iterator_finished = True
+
+            fill_thread = Thread(target=fill_buffer, daemon=True)
+            fill_thread.start()
+
+            def audio_callback(outdata, frames, time_info, status):
+                nonlocal buffer, has_started
+
+                if status:
+                    print(f"Audio callback status: {status}", flush=True)
+
+                with buffer_lock:
+                    if not has_started:
+                        if len(buffer) >= prebuffer_samples or iterator_finished:
+                            has_started = True
+                            print("Starting playback (prebuffer ready)...", flush=True)
+                        else:
+                            outdata.fill(0)
+                            return
+
+                    if len(buffer) >= frames:
+                        outdata[:] = buffer[:frames].reshape(-1, 1)
+                        buffer = buffer[frames:]
+                    elif len(buffer) > 0:
+                        outdata[:len(buffer)] = buffer.reshape(-1, 1)
+                        outdata[len(buffer):] = 0
+                        buffer = np.array([], dtype=np.float32)
+                    else:
+                        outdata.fill(0)
+
+            try:
+                with sd.OutputStream(
+                    samplerate=self.sample_rate,
+                    blocksize=BLOCKSIZE,
+                    device=self.device_id,
+                    channels=1,
+                    dtype='float32',
+                    callback=audio_callback
+                ):
+                    print("Audio stream started...", flush=True)
+                    fill_thread.join()
+
+                    while True:
+                        with buffer_lock:
+                            if len(buffer) == 0 and iterator_finished:
+                                break
+                        sd.sleep(100)
+
+                    sd.sleep(200)  # Final audio drain
+
+            except Exception as e:
+                print(f"Error during audio streaming: {e}", flush=True)
+                raise
+
+            print("Playback completed!", flush=True)
+
+        else:
+            # Buffered playback
+            chunks = []
+            print("Collecting audio chunks...", end="", flush=True)
+            for audio_chunk in audio_iterator:
+                chunks.append(audio_chunk)
+                print(".", end="", flush=True)
+            print(" Done!")
+
+            if chunks:
+                print("Playing audio...")
+                full_audio = np.concatenate(chunks)
+                sd.play(full_audio, samplerate=self.sample_rate, device=self.device_id)
+                sd.wait()
+
+    def stop(self):
+        """Stop current playback."""
+        if SOUNDDEVICE_AVAILABLE:
+            sd.stop()
+
+
+def list_default_voices() -> list[str]:
+    """
+    List available default voice prompts.
+
+    Returns:
+        List of available voice names (without .pt extension)
+
+    Example:
+        >>> from vibevoice import list_default_voices
+        >>> voices = list_default_voices()
+        >>> print(f"Available voices: {', '.join(voices)}")
+    """
+    default_voice_dir = Path(__file__).parent.parent / "demo" / "voices" / "streaming_model"
+    if not default_voice_dir.exists():
+        return []
+
+    voice_files = sorted(default_voice_dir.glob("*.pt"))
+    return [v.stem for v in voice_files]
+
+
+def synthesize_speech(
+    text: Iterator[str] | str,
+    model_path: str = "microsoft/VibeVoice-Realtime-0.5B",
+    voice_prompt_path: Optional[str] = None,
+    device: str = "cuda",
+    output_file: Optional[str] = None,
+    play_audio: bool = True,
+    speaker_device_id: Optional[int] = None,
+    inference_steps: int = 5,
+    cfg_scale: float = 1.5,
+    realtime: bool = True,
+) -> Optional[np.ndarray]:
+    """
+    High-level function to synthesize speech from text.
+
+    This is a convenience function that handles model loading, generation,
+    and playback in a single call.
+
+    Args:
+        text: Text to synthesize (string or iterator of strings)
+        model_path: Path to VibeVoice model or HuggingFace model ID
+        voice_prompt_path: Optional path to voice prompt for voice cloning.
+                          If None, will automatically use a default voice.
+        device: Device to run on ('cuda', 'mps', 'cpu')
+        output_file: Optional path to save audio to file
+        play_audio: If True, play audio through speakers
+        speaker_device_id: Speaker device ID (None for default)
+        inference_steps: Number of diffusion steps (5=fast, 50=quality)
+        cfg_scale: Classifier-free guidance scale (1.0-2.0)
+        realtime: If True, use streaming playback mode
+
+    Returns:
+        np.ndarray: Generated audio if output_file is specified, else None
+
+    Example:
+        >>> # Simple usage
+        >>> synthesize_speech("Hello world!")
+        >>>
+        >>> # Save to file
+        >>> synthesize_speech("Hello world!", output_file="output.wav")
+        >>>
+        >>> # Voice cloning
+        >>> synthesize_speech(
+        ...     "Hello from my cloned voice",
+        ...     voice_prompt_path="voices/speaker.pt"
+        ... )
+        >>>
+        >>> # High quality
+        >>> synthesize_speech(
+        ...     "High quality speech",
+        ...     inference_steps=50,
+        ...     cfg_scale=2.0
+        ... )
+    """
+    # Initialize TTS
+    print(f"Loading model from {model_path}...")
+    tts = VibeVoiceStreamingTTS(
+        model_path=model_path,
+        voice_prompt_path=voice_prompt_path,
+        device=device,
+        inference_steps=inference_steps,
+    )
+
+    # Simple text generator
+    def text_gen():
+        if isinstance(text, str):
+            yield text
+        else:
+            for chunk in text:
+                yield chunk
+
+    # Generate audio
+    print(f"Generating speech for: '{text}'")
+    audio_stream = tts.text_to_speech_streaming(text_gen(), cfg_scale=cfg_scale)
+
+    # Collect chunks if we need to save
+    if output_file or not play_audio:
+        chunks = []
+        for chunk in audio_stream:
+            chunks.append(chunk)
+        full_audio = np.concatenate(chunks)
+
+        if output_file:
+            print(f"Saving audio to {output_file}...")
+            tts.save_audio(full_audio, output_file)
+
+        if play_audio and SOUNDDEVICE_AVAILABLE:
+            print("Playing audio...")
+            player = AudioPlayer(device_id=speaker_device_id)
+            sd.play(full_audio, samplerate=tts.sample_rate, device=speaker_device_id)
+            sd.wait()
+
+        return full_audio if output_file else None
+
+    # Stream and play directly
+    if play_audio:
+        if not SOUNDDEVICE_AVAILABLE:
+            print("Warning: sounddevice not available, cannot play audio")
+            return None
+
+        print("Playing audio in real-time...")
+        player = AudioPlayer(device_id=speaker_device_id)
+        player.play_stream(audio_stream, realtime=realtime)
+    full_audio = [chunk for chunk in audio_stream]
+
+    return full_audio if output_file else None

From 6d879f6c445f137395909c703d3d49c4acba82f6 Mon Sep 17 00:00:00 2001
From: Himansh raj <iamthehimansh@gmail.com>
Date: Tue, 9 Dec 2025 23:41:27 +0530
Subject: [PATCH 2/2] made docs concise  and removed extra one

---
 PYTHON_API.md            | 230 ----------------
 docs/python_inference.md | 563 ++++++---------------------------------
 2 files changed, 77 insertions(+), 716 deletions(-)
 delete mode 100644 PYTHON_API.md

diff --git a/PYTHON_API.md b/PYTHON_API.md
deleted file mode 100644
index e02f2cf0..00000000
--- a/PYTHON_API.md
+++ /dev/null
@@ -1,230 +0,0 @@
-# VibeVoice Python API
-
-Easy-to-use Python API for real-time text-to-speech with VibeVoice.
-
-## Quick Start
-
-### One-Line Synthesis (Easiest!)
-
-```python
-from vibevoice import synthesize_speech
-
-# Simplest possible - automatically uses default voice!
-synthesize_speech("Hello world!", device="cuda")
-```
-
-### Class-Based API
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-# Initialize TTS (automatically loads default voice)
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"
-)
-
-# Initialize audio player
-player = AudioPlayer()
-
-# Generate text
-def text_gen():
-    for word in ["Hello", "world"]:
-        yield word
-
-# Generate and play
-audio = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio, realtime=True)
-```
-
-## Installation
-
-```bash
-# Install VibeVoice
-pip install -e .
-
-# Install audio playback support
-pip install sounddevice
-```
-
-## Features
-
-- ✅ **One-line synthesis** - `synthesize_speech("Hello!")`
-- ✅ **Automatic voice loading** - 7 default voices included, no setup needed!
-- ✅ **Real-time streaming** - ~100ms latency
-- ✅ **Voice cloning** - Use voice prompts for speaker cloning
-- ✅ **Speaker selection** - Choose output device
-- ✅ **Easy-to-use API** - Simple high-level interface
-- ✅ **GPU acceleration** - CUDA, Apple Silicon (MPS), CPU support
-- ✅ **Iterator-based** - Works with LLM token streams
-
-## Documentation
-
-- **[Python Inference Guide](docs/python_inference.md)** - Complete API reference
-- **[Examples](examples/)** - Code examples
-
-## API Overview
-
-### High-Level Functions
-
-```python
-from vibevoice import synthesize_speech, list_default_voices
-
-# One-line synthesis (easiest!)
-synthesize_speech("Hello world!", device="cuda")
-
-# List available default voices
-voices = list_default_voices()
-# ['en-Mike_man', 'en-Emma_woman', 'en-Carter_man', ...]
-
-# With iterator (LLM streaming)
-def text_gen():
-    for word in ["Hello", "world"]:
-        yield word
-synthesize_speech(text_gen(), device="cuda")
-```
-
-### Class-Based API
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-# TTS with streaming (automatically loads default voice)
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    voice_prompt_path="path/to/voice.pt",  # Optional - uses default if None
-    device="cuda",
-    inference_steps=5
-)
-
-# Audio player with device selection
-player = AudioPlayer(device_id=None)  # None = default device
-player.play_stream(audio_iterator, realtime=True)
-```
-
-### Low-Level API
-
-```python
-from vibevoice import (
-    VibeVoiceStreamingForConditionalGenerationInference,
-    VibeVoiceStreamingProcessor,
-    AudioStreamer
-)
-
-# Direct model access for advanced users
-processor = VibeVoiceStreamingProcessor.from_pretrained(model_path)
-model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-    model_path,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda"
-)
-```
-
-## Examples
-
-### One-Line Synthesis
-
-```python
-from vibevoice import synthesize_speech
-
-# Simplest possible
-synthesize_speech("Hello world!", device="cuda")
-
-# With iterator
-def text_gen():
-    for word in ["Hello", "world"]:
-        yield word
-synthesize_speech(text_gen(), device="cuda")
-```
-
-### Basic TTS with Classes
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
-player = AudioPlayer()
-
-def text_gen():
-    yield "Hello world!"
-
-audio = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio, realtime=True)
-```
-
-### Save to File
-
-```python
-import numpy as np
-from vibevoice import VibeVoiceStreamingTTS
-
-tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
-
-chunks = list(tts.text_to_speech_streaming(text_gen()))
-full_audio = np.concatenate(chunks)
-tts.save_audio(full_audio, "output.wav")
-```
-
-### Voice Cloning
-
-```python
-from vibevoice import VibeVoiceStreamingTTS
-
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    voice_prompt_path="voices/speaker.pt",  # Speaker embedding
-    device="cuda"
-)
-
-audio = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio, realtime=True)
-```
-
-### LLM Integration
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-tts = VibeVoiceStreamingTTS("microsoft/VibeVoice-Realtime-0.5B", device="cuda")
-player = AudioPlayer()
-
-def llm_stream():
-    """Your LLM generates tokens here"""
-    for token in llm.generate():
-        yield token
-
-# Real-time TTS as LLM generates
-audio = tts.text_to_speech_streaming(llm_stream())
-player.play_stream(audio, realtime=True)
-```
-
-## Performance
-
-- **First audio chunk**: ~100-300ms (CUDA)
-- **Audio quality**: 24kHz sample rate
-- **Devices**: CUDA (fastest), MPS (Apple), CPU (slower)
-- **Inference steps**: 5 (fast) to 50 (high quality)
-
-## Requirements
-
-- Python >= 3.9
-- PyTorch >= 2.0
-- sounddevice (for audio playback)
-- CUDA toolkit (optional, for GPU)
-
-## License
-
-See [LICENSE](LICENSE) for details.
-
-## Citation
-
-If you use VibeVoice in your research, please cite:
-
-```bibtex
-@article{vibevoice2025,
-  title={VibeVoice: Real-time Streaming Text-to-Speech},
-  author={VibeVoice Team},
-  journal={Microsoft Research},
-  year={2025}
-}
-```
diff --git a/docs/python_inference.md b/docs/python_inference.md
index ce8b1ea5..39f589d2 100644
--- a/docs/python_inference.md
+++ b/docs/python_inference.md
@@ -1,81 +1,28 @@
 # VibeVoice Python Inference Guide
 
-Complete guide for using VibeVoice text-to-speech in Python with streaming support.
+Complete API reference for VibeVoice text-to-speech.
 
 ## Table of Contents
 
-- [Installation](#installation)
 - [Quick Start](#quick-start)
 - [API Reference](#api-reference)
-  - [synthesize_speech()](#synthesize_speech) - High-level function (easiest!)
-  - [list_default_voices()](#list_default_voices) - List available voices
+  - [synthesize_speech()](#synthesize_speech)
+  - [list_default_voices()](#list_default_voices)
   - [VibeVoiceStreamingTTS](#vibevoicestreamingtts)
   - [AudioPlayer](#audioplayer)
-- [Advanced Usage](#advanced-usage)
-- [Examples](#examples)
-
----
-
-## Installation
-
-```bash
-# Install VibeVoice
-pip install -e /path/to/VibeVoice
-
-# Install audio playback support (optional, required for AudioPlayer)
-pip install sounddevice
-```
 
 ---
 
 ## Quick Start
 
-### One-Line Synthesis (Easiest!)
-
 ```python
 from vibevoice import synthesize_speech
 
-# Simplest possible - automatically uses default voice!
-synthesize_speech("Hello world!", device="cuda")
-```
-
-### Basic Text-to-Speech
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-# Initialize TTS (automatically loads default voice)
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"  # or "cpu" or "mps"
-)
-
-# Initialize audio player
-player = AudioPlayer()
-
-# Generate text
-def text_generator():
-    for word in ["Hello", "world", "from", "VibeVoice"]:
-        yield word
+# Simplest
+synthesize_speech("Hello world!")
 
-# Generate and play audio in real-time
-audio_stream = tts.text_to_speech_streaming(text_generator())
-player.play_stream(audio_stream, realtime=True)
-```
-
-### With Voice Cloning
-
-```python
-# Load with voice prompt for voice cloning
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    voice_prompt_path="path/to/voice.pt",  # Speaker embedding
-    device="cuda"
-)
-
-# Generate with cloned voice
-audio_stream = tts.text_to_speech_streaming(text_generator())
-player.play_stream(audio_stream, realtime=True)
+# With device
+synthesize_speech(text="Hello world!", device="cuda")
 ```
 
 ---
@@ -84,521 +31,165 @@ player.play_stream(audio_stream, realtime=True)
 
 ### synthesize_speech()
 
-**Easiest way to use VibeVoice!** One-line function for text-to-speech synthesis.
+One-line function for text-to-speech.
 
 ```python
 synthesize_speech(
     text: str | Iterator[str],
-    model_path: str = "microsoft/VibeVoice-Realtime-0.5B",
-    voice_prompt_path: Optional[str] = None,
     device: str = "cuda",
-    output_file: Optional[str] = None,
-    play_audio: bool = True,
-    speaker_device_id: Optional[int] = None,
+    output_file: str = None,
+    voice_prompt_path: str = None,
     inference_steps: int = 5,
     cfg_scale: float = 1.5,
-    realtime: bool = True
-) -> Optional[np.ndarray]
+    **kwargs
+)
 ```
 
-**Parameters:**
-
-- `text` (str or Iterator[str]): Text to synthesize or iterator yielding text chunks
-- `model_path` (str): HuggingFace model ID (default: "microsoft/VibeVoice-Realtime-0.5B")
-- `voice_prompt_path` (str, optional): Custom voice prompt path. If None, uses default voice.
-- `device` (str): Device ("cuda", "mps", "cpu")
-- `output_file` (str, optional): Path to save WAV file
-- `play_audio` (bool): Whether to play audio (default: True)
-- `speaker_device_id` (int, optional): Speaker device ID (None for default)
-- `inference_steps` (int): Diffusion steps (5=fast, 50=quality)
-- `cfg_scale` (float): Guidance scale (1.0-2.0)
-- `realtime` (bool): Use streaming playback (default: True)
+**Key Parameters:**
 
-**Returns:**
-- `np.ndarray` or `None`: Audio array if `output_file` specified, else None
+- `text` - Text or iterator
+- `device` - "cuda", "mps", or "cpu"
+- `output_file` - Save path (optional)
+- `inference_steps` - 5 (fast) to 50 (quality)
+- `cfg_scale` - 1.0-2.0 (quality)
 
 **Examples:**
 
 ```python
-from vibevoice import synthesize_speech
+# Basic
+synthesize_speech(text="Hello", device="cuda")
 
-# Simple usage
-synthesize_speech("Hello world!", device="cuda")
+# Iterator (LLM streaming)
+synthesize_speech(text=["Hello", "world"], device="cuda")
 
-# With iterator (LLM streaming)
-def text_gen():
-    for word in ["Hello", "streaming", "world"]:
-        yield word
-synthesize_speech(text_gen(), device="cuda")
-
-# Save to file
-synthesize_speech("Save this", output_file="output.wav", device="cuda")
+# Save file
+synthesize_speech(text="Hello", device="cuda", output_file="out.wav")
 
 # Custom voice
 synthesize_speech(
-    "Custom voice",
-    voice_prompt_path="voices/custom.pt",
-    device="cuda"
+    text="Hello",
+    device="cuda",
+    voice_prompt_path="voices/custom.pt"
 )
 
 # High quality
-synthesize_speech(
-    "High quality",
-    inference_steps=50,
-    cfg_scale=2.0,
-    device="cuda"
-)
+synthesize_speech(text="Hello", device="cuda", inference_steps=50, cfg_scale=2.0)
 ```
 
 ---
 
 ### list_default_voices()
 
-List available default voice prompts included with VibeVoice.
-
-```python
-list_default_voices() -> list[str]
-```
-
-**Returns:**
-- `list[str]`: List of available voice names (without .pt extension)
-
-**Example:**
+List available voice presets.
 
 ```python
-from vibevoice import list_default_voices
-
 voices = list_default_voices()
-print(voices)
-# ['en-Carter_man', 'en-Davis_man', 'en-Emma_woman', 'en-Frank_man',
-#  'en-Grace_woman', 'en-Mike_man', 'in-Samuel_man']
-
-# Use a specific default voice
-voice_path = f"demo/voices/streaming_model/{voices[2]}.pt"  # en-Emma_woman
-synthesize_speech("Hello", voice_prompt_path=voice_path)
+# Returns: ['en-Carter_man', 'en-Davis_man', 'en-Emma_woman', ...]
 ```
 
 ---
 
 ### VibeVoiceStreamingTTS
 
-High-level wrapper for VibeVoice streaming text-to-speech.
-
-#### Constructor
-
-```python
-VibeVoiceStreamingTTS(
-    model_path: str,
-    voice_prompt_path: Optional[str] = None,
-    device: str = "cuda",
-    inference_steps: int = 5
-)
-```
-
-**Parameters:**
-
-- `model_path` (str): Path to VibeVoice model or HuggingFace model ID
-  - Example: `"microsoft/VibeVoice-Realtime-0.5B"`
-- `voice_prompt_path` (str, optional): Path to voice prompt file (.pt) for voice cloning
-  - **If None (default):** Automatically uses a default voice from `demo/voices/streaming_model/`
-  - **7 default voices available:** en-Mike_man, en-Emma_woman, en-Carter_man, en-Davis_man, en-Frank_man, en-Grace_woman, in-Samuel_man
-  - Use `list_default_voices()` to see available voices
-- `device` (str): Device to run on
-  - `"cuda"` - NVIDIA GPU (fastest, requires flash-attention-2)
-  - `"mps"` - Apple Silicon GPU
-  - `"cpu"` - CPU (slower)
-- `inference_steps` (int): Number of diffusion steps
-  - `5` - Fast, good quality (default, ~100ms latency)
-  - `50` - High quality (~500ms latency)
-
-#### Methods
-
-##### `text_to_speech_streaming(text_iterator, cfg_scale=1.5)`
-
-Generate speech from text iterator with real-time streaming.
-
-**Parameters:**
-
-- `text_iterator` (Iterator[str]): Iterator yielding text tokens/chunks
-- `cfg_scale` (float): Classifier-free guidance scale (1.0-2.0)
-  - `1.0` - Faster, lower quality
-  - `1.5` - Balanced (default)
-  - `2.0` - Better quality, slower
-
-**Returns:**
-
-- Iterator[np.ndarray]: Audio chunks as float32 numpy arrays
-
-**Example:**
-
-```python
-def text_gen():
-    for word in ["Hello", "world"]:
-        yield word
-
-for audio_chunk in tts.text_to_speech_streaming(text_gen()):
-    # audio_chunk is np.ndarray, shape (N,), dtype float32
-    # values are normalized to [-1.0, 1.0]
-    print(f"Received {len(audio_chunk)} samples")
-```
-
-##### `save_audio(audio, output_path)`
-
-Save generated audio to WAV file.
-
-**Parameters:**
-
-- `audio` (np.ndarray): Audio data
-- `output_path` (str): Path to save WAV file
-
-**Example:**
-
-```python
-import numpy as np
-
-# Collect all chunks
-chunks = list(tts.text_to_speech_streaming(text_gen()))
-
-# Concatenate and save
-full_audio = np.concatenate(chunks)
-tts.save_audio(full_audio, "output.wav")
-```
-
----
-
-### AudioPlayer
-
-Audio player with speaker selection and streaming support.
-
-#### Constructor
-
-```python
-AudioPlayer(device_id: Optional[int] = None, sample_rate: int = 24000)
-```
-
-**Parameters:**
-
-- `device_id` (int, optional): Speaker device ID (None for default)
-- `sample_rate` (int): Audio sample rate in Hz (default 24000)
-
-#### Methods
-
-##### `list_devices(show_all=False)` [static]
-
-List available audio output devices.
-
-**Parameters:**
-
-- `show_all` (bool): If True, show all devices including duplicates
-
-**Example:**
-
-```python
-AudioPlayer.list_devices()
-# Output:
-# Available Audio Output Devices:
-# [3] Microsoft Sound Mapper - Output ⭐ DEFAULT
-# [4] Speakers (USB Audio Device)
-```
-
-##### `get_default_output_device()` [static]
+High-level TTS class for advanced usage.
 
-Get default output device ID.
-
-**Returns:**
-
-- int: Default device ID
-
-**Example:**
-
-```python
-device_id = AudioPlayer.get_default_output_device()
-player = AudioPlayer(device_id=device_id)
-```
-
-##### `play_stream(audio_iterator, realtime=True)`
-
-Play audio from an iterator of chunks.
-
-**Parameters:**
-
-- `audio_iterator` (Iterator[np.ndarray]): Iterator yielding audio chunks
-- `realtime` (bool): Streaming mode
-  - `True` - Real-time streaming with ~100ms latency
-  - `False` - Buffered playback (waits for all chunks, smooth)
-
-**Example:**
-
-```python
-# Real-time streaming (low latency)
-audio_stream = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio_stream, realtime=True)
-
-# Buffered playback (smooth, no gaps)
-audio_stream = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio_stream, realtime=False)
-```
-
-##### `stop()`
-
-Stop current playback.
-
----
-
-## Advanced Usage
-
-### Select Specific Speaker
+**Constructor:**
 
 ```python
-# List available devices
-AudioPlayer.list_devices()
-
-# Use specific device
-player = AudioPlayer(device_id=4)  # Use device 4
-audio_stream = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio_stream, realtime=True)
-```
-
-### Custom Quality Settings
-
-```python
-# High quality (slower)
 tts = VibeVoiceStreamingTTS(
     model_path="microsoft/VibeVoice-Realtime-0.5B",
     device="cuda",
-    inference_steps=50  # More steps = better quality
-)
-
-audio_stream = tts.text_to_speech_streaming(
-    text_gen(),
-    cfg_scale=2.0  # Higher CFG = better quality
+    voice_prompt_path=None,  # Auto-loads default
+    inference_steps=5
 )
-player.play_stream(audio_stream, realtime=True)
 ```
 
-### Process Audio Chunks
+**Parameters:**
 
-```python
-import soundfile as sf
+- `model_path` - HuggingFace model ID
+- `device` - "cuda", "mps", "cpu"
+- `voice_prompt_path` - Voice file (optional, auto-loads if None)
+- `inference_steps` - 5-50 (speed vs quality)
 
-audio_chunks = []
-for audio_chunk in tts.text_to_speech_streaming(text_gen()):
-    # Process each chunk as it arrives
-    audio_chunks.append(audio_chunk)
+**Methods:**
 
-    # You can also apply effects here
-    # audio_chunk = apply_effects(audio_chunk)
+#### `text_to_speech_streaming(text_iterator, cfg_scale=1.5)`
 
-# Save all chunks
-full_audio = np.concatenate(audio_chunks)
-sf.write("output.wav", full_audio, tts.sample_rate)
-```
-
-### Streaming from LLM Output
+Generate speech from iterator.
 
 ```python
-def llm_token_stream():
-    """Simulate LLM generating tokens"""
-    llm_output = [
-        "The", "weather", "today", "is", "sunny", "and", "warm."
-    ]
-    for token in llm_output:
-        yield token
-
-# Convert LLM output to speech in real-time
-audio_stream = tts.text_to_speech_streaming(llm_token_stream())
-player.play_stream(audio_stream, realtime=True)
-```
-
----
-
-## Examples
-
-### Example 1: Simple TTS
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"
-)
-
-player = AudioPlayer()
-
 def text_gen():
-    yield "Hello world!"
+    yield "Hello world"
 
 audio = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio, realtime=True)
+# Returns: Iterator[np.ndarray]
 ```
 
-### Example 2: Voice Cloning
+#### `save_audio(audio, output_path)`
 
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-# Load with voice prompt
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    voice_prompt_path="voices/emma.pt",  # Clone Emma's voice
-    device="cuda"
-)
-
-player = AudioPlayer()
-
-def text_gen():
-    yield "This is a cloned voice speaking!"
-
-audio = tts.text_to_speech_streaming(text_gen())
-player.play_stream(audio, realtime=True)
-```
-
-### Example 3: Save to File
+Save audio to WAV file.
 
 ```python
 import numpy as np
-from vibevoice import VibeVoiceStreamingTTS
-
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"
-)
 
-def text_gen():
-    for sentence in ["Hello.", "How are you?", "Goodbye!"]:
-        yield sentence
-
-# Collect all chunks
 chunks = list(tts.text_to_speech_streaming(text_gen()))
-full_audio = np.concatenate(chunks)
-
-# Save to file
-tts.save_audio(full_audio, "output.wav")
-print("Audio saved to output.wav")
+audio = np.concatenate(chunks)
+tts.save_audio(audio, "output.wav")
 ```
 
-### Example 4: Multiple Speaker Devices
-
-```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-
-# List available devices
-AudioPlayer.list_devices()
-
-# Use multiple devices
-player1 = AudioPlayer(device_id=3)  # Default speaker
-player2 = AudioPlayer(device_id=4)  # External speaker
-
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"
-)
-
-def text_gen():
-    yield "Hello from VibeVoice!"
+---
 
-# Play on device 1
-audio1 = tts.text_to_speech_streaming(text_gen())
-player1.play_stream(audio1, realtime=True)
+### AudioPlayer
 
-# Play on device 2
-audio2 = tts.text_to_speech_streaming(text_gen())
-player2.play_stream(audio2, realtime=True)
-```
+Audio playback with speaker selection.
 
-### Example 5: Real-time LLM Integration
+**Constructor:**
 
 ```python
-from vibevoice import VibeVoiceStreamingTTS, AudioPlayer
-import threading
-
-tts = VibeVoiceStreamingTTS(
-    model_path="microsoft/VibeVoice-Realtime-0.5B",
-    device="cuda"
-)
-
-player = AudioPlayer()
-
-def llm_stream():
-    """Your LLM generates tokens here"""
-    tokens = [
-        "Once", "upon", "a", "time", "there", "was",
-        "a", "voice", "assistant", "that", "could", "speak."
-    ]
-    for token in tokens:
-        yield token
-
-# Generate and play simultaneously
-audio_stream = tts.text_to_speech_streaming(llm_stream())
-player.play_stream(audio_stream, realtime=True)
+player = AudioPlayer(device_id=None, sample_rate=24000)
 ```
 
----
-
-## Performance Tips
+**Methods:**
 
-1. **Use CUDA** - GPU is much faster than CPU
-   ```python
-   tts = VibeVoiceStreamingTTS(model_path="...", device="cuda")
-   ```
-
-2. **Lower inference steps** for lower latency
-   ```python
-   tts = VibeVoiceStreamingTTS(model_path="...", inference_steps=5)
-   ```
-
-3. **Use real-time streaming** for lowest latency
-   ```python
-   player.play_stream(audio, realtime=True)
-   ```
-
-4. **Prebuffer for smoother playback**
-   - Real-time mode prebuffers 100ms automatically
-   - Buffered mode collects all audio first
-
----
-
-## Troubleshooting
-
-### No audio output
+#### `list_devices()` [static]
 
 ```python
-# Check available devices
 AudioPlayer.list_devices()
-
-# Try default device
-player = AudioPlayer(device_id=None)
+# Shows available speakers
 ```
 
-### CUDA out of memory
+#### `play_stream(audio_iterator, realtime=True)`
 
 ```python
-# Use CPU instead
-tts = VibeVoiceStreamingTTS(model_path="...", device="cpu")
+player.play_stream(audio, realtime=True)  # Streaming
+player.play_stream(audio, realtime=False)  # Buffered
 ```
 
-### Import errors
+---
 
-```bash
-# Reinstall VibeVoice
-pip install -e /path/to/VibeVoice
+## Quick Reference
 
-# Install sounddevice for audio playback
-pip install sounddevice
-```
+| Function | Purpose |
+|----------|---------|
+| `synthesize_speech()` | One-line TTS |
+| `list_default_voices()` | See available voices |
+| `VibeVoiceStreamingTTS` | Advanced TTS class |
+| `AudioPlayer` | Audio playback |
 
-### Distorted audio
+**Devices:**
+- `"cuda"` - NVIDIA GPU (fastest)
+- `"mps"` - Apple Silicon
+- `"cpu"` - CPU (slower)
 
-Audio is automatically normalized and clipped to [-1.0, 1.0] range. If you still hear distortion, try:
+**Quality Settings:**
+- Fast: `inference_steps=5`, `cfg_scale=1.5`
+- Quality: `inference_steps=50`, `cfg_scale=2.0`
 
-```python
-# Lower CFG scale
-audio = tts.text_to_speech_streaming(text_gen(), cfg_scale=1.0)
-```
+**Default Voices:**
+- en-Mike_man, en-Emma_woman, en-Carter_man, en-Davis_man, en-Frank_man, en-Grace_woman, in-Samuel_man
 
 ---
 
 ## License
 
-See the [LICENSE](../LICENSE) file for details.
+See [LICENSE](../LICENSE) for details.