From 5c14124f1c00ca77a94fcede810ccd785d0a3421 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Sun, 11 Jan 2026 21:54:46 -0500 Subject: [PATCH 01/13] Major Refactor - Audio Streaming Across All Backends --- src/TextToTalk.Lexicons/LexiconManager.cs | 1 + src/TextToTalk.Tests/packages.lock.json | 136 ++++--- src/TextToTalk/Backends/Azure/AzureClient.cs | 47 ++- src/TextToTalk/Backends/BackendUI.cs | 35 ++ .../Backends/ElevenLabs/ElevenLabsBackend.cs | 10 + .../ElevenLabs/ElevenLabsBackendUIModel.cs | 4 +- .../Backends/ElevenLabs/ElevenLabsClient.cs | 120 ++++--- .../GoogleCloud/GoogleCloudBackend.cs | 16 +- .../GoogleCloud/GoogleCloudBackendUI.cs | 19 +- .../Backends/GoogleCloud/GoogleCloudClient.cs | 107 ++++-- .../GoogleCloud/GoogleCloudVoicePreset.cs | 11 +- .../Backends/Kokoro/KokoroSoundQueue.cs | 164 ++++++--- .../Backends/OpenAI/OpenAiBackend.cs | 40 ++- .../Backends/OpenAI/OpenAiBackendUI.cs | 40 ++- .../Backends/OpenAI/OpenAiBackendUIModel.cs | 79 +++-- .../Backends/OpenAI/OpenAiClient.cs | 292 ++++----------- .../Backends/OpenAI/OpenAiVoicePreset.cs | 25 +- src/TextToTalk/Backends/Polly/PollyClient.cs | 45 ++- src/TextToTalk/Backends/SoundQueue.cs | 3 +- src/TextToTalk/Backends/StreamFormat.cs | 3 + src/TextToTalk/Backends/StreamSoundQueue.cs | 2 + .../Backends/StreamSoundQueueItem.cs | 36 +- .../Backends/StreamingSoundQueue.cs | 335 ++++++++++++++++++ .../Backends/System/SystemBackend.cs | 1 + .../Backends/System/SystemSoundQueue.cs | 263 ++++++++++---- .../Backends/System/SystemSoundQueueItem.cs | 7 + .../Backends/Uberduck/UberduckBackend.cs | 6 +- .../Backends/Uberduck/UberduckBackendUI.cs | 13 +- .../Backends/Uberduck/UberduckClient.cs | 163 +++++---- .../Uberduck/UberduckCredentialManager.cs | 4 +- .../Backends/Uberduck/UberduckVoice.cs | 27 +- src/TextToTalk/PluginConfiguration.cs | 1 + src/TextToTalk/TextToTalk.csproj | 8 +- src/TextToTalk/VoicePresetConfiguration.cs | 2 - src/TextToTalk/packages.lock.json | 130 ++++--- 35 files changed, 1483 insertions(+), 712 deletions(-) create mode 100644 src/TextToTalk/Backends/StreamingSoundQueue.cs diff --git a/src/TextToTalk.Lexicons/LexiconManager.cs b/src/TextToTalk.Lexicons/LexiconManager.cs index d4b58b46..3bc2d287 100644 --- a/src/TextToTalk.Lexicons/LexiconManager.cs +++ b/src/TextToTalk.Lexicons/LexiconManager.cs @@ -97,6 +97,7 @@ public string MakeSsml( bool includeSpeakAttributes = true) { + text = System.Security.SecurityElement.Escape(text); foreach (var (_, lexicon) in this.lexicons) { foreach (var lexeme in lexicon.Lexemes.Where(lexeme => text.Contains(lexeme.Grapheme))) diff --git a/src/TextToTalk.Tests/packages.lock.json b/src/TextToTalk.Tests/packages.lock.json index 7d4f6272..c0a0507f 100644 --- a/src/TextToTalk.Tests/packages.lock.json +++ b/src/TextToTalk.Tests/packages.lock.json @@ -87,119 +87,114 @@ "resolved": "14.0.1", "contentHash": "y0WWyUE6dhpGdolK3iKgwys05/nZaVf4ZPtIjpLhJBZvHxkkiE23zYRo7K7uqAgoK/QvK5cqF6l3VG5AbgC6KA==" }, - "Fare": { - "type": "Transitive", - "resolved": "2.2.1", - "contentHash": "21XZo/yuXK1k0EUhdLnjgRD4n0HQYmPFchV6uaORcRc65rasZ1vdm2dmJXPBKZiIBztRRYRmmg/B76W721VWkA==" - }, "Google.Api.CommonProtos": { "type": "Transitive", - "resolved": "2.16.0", - "contentHash": "37MuZrE9AAqHAdYgFLoTHydAiXDRriQZGVKEg6fr6ASnrY5GtauYXnQrGk5x2K3NmYzEXe+wkpaPVmxjb3NKjg==", + "resolved": "2.17.0", + "contentHash": "elfQPknFr495hm7vdy6ZlgyQh6yzZq9TU7sS35L/Fj/fqjM/mUGau9gVJLhvQEtUlPjtR80hpn/m9HvBMyCXIw==", "dependencies": { - "Google.Protobuf": "[3.28.2, 4.0.0)" + "Google.Protobuf": "[3.31.1, 4.0.0]" } }, 
"Google.Api.Gax": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "fjHHYcQ99u0ztqwT537rvVtJMdDy6G2VHBZ+F1cBjDGYNVZfrpk40DMQ/OpUGToT9ZGHVirhh3eJ73bw2ANVPQ==", + "resolved": "4.12.1", + "contentHash": "G62dRNOv5DolfRviT6CCrL2a5nZ/CWWdRzhADkGnpCkYSOc3QnH5xxRvZiOKuHU8weJ/pAqAqrj7+T9IWdlu2Q==", "dependencies": { "Microsoft.Bcl.AsyncInterfaces": "6.0.0", - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Api.Gax.Grpc": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "ToCx/0cs+wJ9j7vzKRcPAKneJVZrz/s9JhW9QsFx1dar9WzTxawQZ8xTjyieSy8tY0UiYCL1qYkn/iRrklYnSA==", + "resolved": "4.12.1", + "contentHash": "W3LjuitOWxWyvbwqeHvpgp0LdshEiTnw/pneDAfAhQ02VgU2gVEzSXfGNPsvL8hDPBXjngR/fWNme8Kungwwkw==", "dependencies": { - "Google.Api.CommonProtos": "2.16.0", - "Google.Api.Gax": "4.9.0", - "Google.Apis.Auth": "1.68.0", - "Grpc.Auth": "2.66.0", - "Grpc.Core.Api": "2.66.0", - "Grpc.Net.Client": "2.66.0", + "Google.Api.CommonProtos": "2.17.0", + "Google.Api.Gax": "4.12.1", + "Google.Apis.Auth": "1.72.0", + "Grpc.Auth": "[2.71.0, 3.0.0)", + "Grpc.Core.Api": "[2.71.0, 3.0.0)", + "Grpc.Net.Client": "[2.71.0, 3.0.0)", "Microsoft.Extensions.DependencyInjection.Abstractions": "6.0.0" } }, "Google.Apis": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "s2MymhdpH+ybZNBeZ2J5uFgFHApBp+QXf9FjZSdM1lk/vx5VqIknJwnaWiuAzXxPrLEkesX0Q+UsiWn39yZ9zw==", + "resolved": "1.72.0", + "contentHash": "QbSJ08W7QuqsfzDPOZDHl1aFzCYwMcfBoHqQRh7koglwDN5WacShCKYMpU/zR1Pf3h3sH6JTGEeM/txAxaJuEg==", "dependencies": { - "Google.Apis.Core": "1.68.0" + "Google.Apis.Core": "1.72.0" } }, "Google.Apis.Auth": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "hFx8Qz5bZ4w0hpnn4tSmZaaFpjAMsgVElZ+ZgVLUZ2r9i+AKcoVgwiNfv1pruNS5cCvpXqhKECbruBCfRezPHA==", + "resolved": "1.72.0", + "contentHash": "RBoFwFKBHKUjuyJf2weEnqICQLaY0TdIrdFv2yC8bsiR2VFYxizOn3C/qN1FWCCb0Uh9GhW+zwAV1yUxPjiocw==", "dependencies": { - "Google.Apis": "1.68.0", - "Google.Apis.Core": "1.68.0", + "Google.Apis": "1.72.0", + "Google.Apis.Core": "1.72.0", "System.Management": "7.0.2" } }, "Google.Apis.Core": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "pAqwa6pfu53UXCR2b7A/PAPXeuVg6L1OFw38WckN27NU2+mf+KTjoEg2YGv/f0UyKxzz7DxF1urOTKg/6dTP9g==", + "resolved": "1.72.0", + "contentHash": "ZmYX1PU0vTKFT42c7gp4zaYcb/0TFAXrt9qw8yEz0wjvaug85+/WddlPTfT525Qei8iIUsF6t4bHYrsb2O7crg==", "dependencies": { - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Cloud.TextToSpeech.V1": { "type": "Transitive", - "resolved": "3.9.0", - "contentHash": "JpejhPzzEQ6rdaf0nsjjJwj1CJb8Zs0x+TH27+A17KF2g0NqrgtAbpkUZTiGlQHhOzJSF1lB3amrQhbGjozJ3A==", + "resolved": "3.17.0", + "contentHash": "27vM1NEBmCqAwqagwS0aEHfRBrFy7z6Ef+BblwKMaxtUUY0amdUdeXLY/PU8RSIHtJoan1K6ZKIS6YYqzgp77g==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.9.0, 5.0.0)", - "Google.LongRunning": "[3.3.0, 4.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)", + "Google.LongRunning": "[3.5.0, 4.0.0)" } }, "Google.LongRunning": { "type": "Transitive", - "resolved": "3.3.0", - "contentHash": "F2SZ83Jo466Wj/s1Z7QhIAmWBXxJZQyXZpcx0P8BR7d6s0FAj67vQjeUPESSJcvsy8AqYiYBhkUr2YpZhTQeHg==", + "resolved": "3.5.0", + "contentHash": "W8xO6FA+rG8WjKOsyIjTKjeKLcyCrjBBYeEdZ4QBkKQcxmRczbrfKhKQmdorb2V35CqXeeTbue5Na6Zkgyv8ow==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.8.0, 5.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)" } }, "Google.Protobuf": { "type": "Transitive", - "resolved": "3.28.2", - "contentHash": 
"Z86ZKAB+v1B/m0LTM+EVamvZlYw/g3VND3/Gs4M/+aDIxa2JE9YPKjDxTpf0gv2sh26hrve3eI03brxBmzn92g==" + "resolved": "3.31.1", + "contentHash": "gSnJbUmGiOTdWddPhqzrEscHq9Ls6sqRDPB9WptckyjTUyx70JOOAaDLkFff8gManZNN3hllQ4aQInnQyq/Z/A==" }, "Grpc.Auth": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "FRQlhMAcHf0GjAXIfhN6RydfZncLLXNNTOtpLL1bt57kp59vu40faW+dr6Vwl7ef/IUFfF38aiB5jvhAA/9Aow==", + "resolved": "2.71.0", + "contentHash": "t2aGh/pMgqmc3GimtYfC7VcgVY/VSbk6SLH+61wewsgK45tzxxD9nYYItT5bpLn7fbebirmHXfgJcVKIArd0cg==", "dependencies": { - "Google.Apis.Auth": "1.68.0", - "Grpc.Core.Api": "2.66.0" + "Google.Apis.Auth": "1.69.0", + "Grpc.Core.Api": "2.71.0" } }, "Grpc.Core.Api": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "HsjsQVAHe4hqP4t4rpUnmq+MZvPdyrlPsWF4T5fbMvyP3o/lMV+KVJfDlaNH8+v0aGQTVT3EsDFufbhaWb52cw==" + "resolved": "2.71.0", + "contentHash": "QquqUC37yxsDzd1QaDRsH2+uuznWPTS8CVE2Yzwl3CvU4geTNkolQXoVN812M2IwT6zpv3jsZRc9ExJFNFslTg==" }, "Grpc.Net.Client": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "GwkSsssXFgN9+M2U+UQWdErf61sn1iqgP+2NRBlDXATcP9vlxda0wySxd/eIL8U522+SnyFNUXlvQ5tAzGk9cA==", + "resolved": "2.71.0", + "contentHash": "U1vr20r5ngoT9nlb7wejF28EKN+taMhJsV9XtK9MkiepTZwnKxxiarriiMfCHuDAfPUm9XUjFMn/RIuJ4YY61w==", "dependencies": { - "Grpc.Net.Common": "2.66.0", + "Grpc.Net.Common": "2.71.0", "Microsoft.Extensions.Logging.Abstractions": "6.0.0" } }, "Grpc.Net.Common": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "YJpQpIvpo0HKlsG6SHwaieyji08qfv0DdEDIewCAA0egQY08637sHOj1netLGUhzBEsCqlGC3e92TZ2uqhxnvw==", + "resolved": "2.71.0", + "contentHash": "v0c8R97TwRYwNXlC8GyRXwYTCNufpDfUtj9la+wUrZFzVWkFJuNAltU+c0yI3zu0jl54k7en6u2WKgZgd57r2Q==", "dependencies": { - "Grpc.Core.Api": "2.66.0" + "Grpc.Core.Api": "2.71.0" } }, "KokoroSharp": { @@ -245,13 +240,16 @@ }, "Microsoft.Extensions.DependencyInjection.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "xlzi2IYREJH3/m6+lUrQlujzX8wDitm4QGnUu6kUXTQAWPuZY8i+ticFJbzfqaetLA6KR/rO6Ew/HuYD+bxifg==" + "resolved": "8.0.2", + "contentHash": "3iE7UF7MQkCv1cxzCahz+Y/guQbTqieyxyaWKhrRO91itI9cOKO76OHeQDahqG4MmW5umr3CcCvGmK92lWNlbg==" }, "Microsoft.Extensions.Logging.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "/HggWBbTwy8TgebGSX5DBZ24ndhzi93sHUBDvP1IxbZD7FDokYzdAr6+vbWGjw2XAfR2EJ1sfKUotpjHnFWPxA==" + "resolved": "8.0.3", + "contentHash": "dL0QGToTxggRLMYY4ZYX5AMwBb+byQBd/5dMiZE07Nv73o6I5Are3C7eQTh7K2+A4ct0PVISSr7TZANbiNb2yQ==", + "dependencies": { + "Microsoft.Extensions.DependencyInjection.Abstractions": "8.0.2" + } }, "Microsoft.ML.OnnxRuntime": { "type": "Transitive", @@ -339,14 +337,22 @@ }, "Newtonsoft.Json": { "type": "Transitive", - "resolved": "13.0.3", - "contentHash": "HrC5BXdl00IP9zeV+0Z848QWPAoCr9P3bDEZguI+gkLcBKAOxix/tLEAAHC+UvDNPv4a2d18lOReHMOagPa+zQ==" + "resolved": "13.0.4", + "contentHash": "pdgNNMai3zv51W5aq268sujXUyx7SNdE2bj1wZcWjAQrKMFZV260lbqYop1d2GM67JI1huLRwxo9ZqnfF/lC6A==" }, "NumSharp": { "type": "Transitive", "resolved": "0.30.0", "contentHash": "1f8m2B/m/ZSsICaqLszspCyA9/sTHK7wBKEH5KsxGg/r3QCYTc2HnfYOGMeCytvo8/j0v/umn5umLOLhdExlFA==" }, + "OpenAI": { + "type": "Transitive", + "resolved": "2.8.0", + "contentHash": "KcYpZ9IhuxFD2hGAJlL5vABtkr00CjeJU0SY8CjZQyzvzkzLop8jhdX3iDvteVJg6e3y4TEiY+Kti4gDJAagnA==", + "dependencies": { + "System.ClientModel": "1.8.1" + } + }, "OpenTK.Audio.OpenAL": { "type": "Transitive", "resolved": "5.0.0-pre.13", @@ -376,6 +382,15 @@ "resolved": "4.0.5", "contentHash": 
"2QC9zDPFT/SOnP7iFdK3AwakEcJ7D3zDoU7IwIAOyEhY4WQ2GQBvLqZ29/R1BSujPNtGHMITmVW1d+VjvLg6lg==" }, + "System.ClientModel": { + "type": "Transitive", + "resolved": "1.8.1", + "contentHash": "4oUQgw/vaO4FBOk3YsH40hbrjxRED1l95rRLvTMtHXfQxapXya9IfPpm/KgwValFFtYTfYGFOs/qzGmGyexicQ==", + "dependencies": { + "Microsoft.Extensions.Logging.Abstractions": "8.0.3", + "System.Memory.Data": "8.0.1" + } + }, "System.CodeDom": { "type": "Transitive", "resolved": "7.0.0", @@ -402,6 +417,11 @@ "System.CodeDom": "7.0.0" } }, + "System.Memory.Data": { + "type": "Transitive", + "resolved": "8.0.1", + "contentHash": "BVYuec3jV23EMRDeR7Dr1/qhx7369dZzJ9IWy2xylvb4YfXsrUxspWc4UWYid/tj4zZK58uGZqn2WQiaDMhmAg==" + }, "System.Numerics.Tensors": { "type": "Transitive", "resolved": "9.0.5", @@ -463,11 +483,11 @@ "AWSSDK.Polly": "[3.7.401.37, )", "AdysTech.CredentialManager": "[2.6.0, )", "DalamudPackager": "[14.0.1, )", - "Fare": "[2.2.1, )", - "Google.Cloud.TextToSpeech.V1": "[3.9.0, )", + "Google.Cloud.TextToSpeech.V1": "[3.17.0, )", "KokoroSharp.CPU": "[0.6.1, )", "Microsoft.CognitiveServices.Speech": "[1.41.1, )", "NAudio": "[2.2.1, )", + "OpenAI": "[2.8.0, )", "R3": "[1.2.9, )", "Standart.Hash.xxHash": "[4.0.5, )", "System.Drawing.Common": "[9.0.0, )", diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index 14cf7a4f..cbb933be 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -1,10 +1,12 @@ -using System; +using Microsoft.CognitiveServices.Speech; +using Microsoft.CognitiveServices.Speech.Audio; +using Serilog; +using System; using System.Collections.Generic; using System.IO; using System.Linq; +using System.Threading; using System.Threading.Tasks; -using Microsoft.CognitiveServices.Speech; -using Microsoft.CognitiveServices.Speech.Audio; using TextToTalk.Lexicons; namespace TextToTalk.Backends.Azure; @@ -13,16 +15,17 @@ public class AzureClient : IDisposable { private readonly SpeechConfig speechConfig; private readonly SpeechSynthesizer synthesizer; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly LexiconManager lexiconManager; private readonly PluginConfiguration config; + private CancellationTokenSource? _ttsCts; public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config) { var audioConfig = AudioConfig.FromWavFileOutput("NUL"); this.speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); this.synthesizer = new SpeechSynthesizer(speechConfig, audioConfig); - this.soundQueue = new StreamSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config); this.lexiconManager = lexiconManager; } @@ -61,6 +64,10 @@ public List GetVoices() public async Task Say(string? voice, int playbackRate, float volume, TextSource source, string text, string style) { + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; + var ssml = this.lexiconManager.MakeSsml( text, style, @@ -68,26 +75,41 @@ public async Task Say(string? 
voice, int playbackRate, float volume, TextSource langCode: "en-US", playbackRate: playbackRate, includeSpeakAttributes: true); - DetailedLog.Verbose(ssml); - var res = await this.synthesizer.SpeakSsmlAsync(ssml); + // LOW LATENCY PATH: Start speaking and stream chunks immediately + speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm); + using var result = await this.synthesizer.StartSpeakingSsmlAsync(ssml); + using var audioDataStream = AudioDataStream.FromResult(result); - HandleResult(res); + byte[] buffer = new byte[4096]; + uint bytesRead; + while ((bytesRead = audioDataStream.ReadData(buffer)) > 0) + { + if (token.IsCancellationRequested) break; + // Create a copy of the buffer for the specific chunk + var chunk = new byte[bytesRead]; + Buffer.BlockCopy(buffer, 0, chunk, 0, (int)bytesRead); - var soundStream = new MemoryStream(res.AudioData); - soundStream.Seek(0, SeekOrigin.Begin); + var chunkStream = new MemoryStream(chunk); + this.soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Azure, null); + } + + } - this.soundQueue.EnqueueSound(soundStream, source, StreamFormat.Wave, volume); - } public Task CancelAllSounds() { + //this.synthesizer.Dispose(); + this.soundQueue.CancelAllSounds(); + this.soundQueue.StopHardware(); this.soundQueue.CancelAllSounds(); return Task.CompletedTask; } public Task CancelFromSource(TextSource source) { + this.synthesizer.StopSpeakingAsync(); + this.soundQueue.StopHardware(); this.soundQueue.CancelFromSource(source); return Task.CompletedTask; } @@ -127,5 +149,6 @@ public void Dispose() { this.synthesizer?.Dispose(); this.soundQueue?.Dispose(); + this.soundQueue?.Dispose(); } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/BackendUI.cs b/src/TextToTalk/Backends/BackendUI.cs index 05249a83..d206904f 100644 --- a/src/TextToTalk/Backends/BackendUI.cs +++ b/src/TextToTalk/Backends/BackendUI.cs @@ -2,6 +2,7 @@ using System.Linq; using Dalamud.Bindings.ImGui; using TextToTalk.UI; +using TextToTalk.UI.Windows; namespace TextToTalk.Backends; @@ -143,4 +144,38 @@ public static bool ImGuiPresetCombo(string label, SortedSet selectedPresets ImGui.EndCombo(); return didPresetsChange; } + public static bool ImGuiStylesCombo(string label, string previewText, SortedSet selectedIndices, List styles) + { + // Use the passed-in string, or a placeholder if it's empty + string displayValue = !string.IsNullOrEmpty(previewText) ? 
previewText : "None selected"; + + bool didChange = false; + + // The second parameter of BeginCombo controls what is shown in the closed box + if (ImGui.BeginCombo(label, displayValue)) + { + for (int i = 0; i < styles.Count; i++) + { + bool isSelected = selectedIndices.Contains(i); + + // Use Selectable with DontClosePopups for multi-select + if (ImGui.Selectable(styles[i], isSelected, ImGuiSelectableFlags.DontClosePopups)) + { + if (!isSelected) + selectedIndices.Add(i); + else + selectedIndices.Remove(i); + + didChange = true; + } + + if (isSelected) + ImGui.SetItemDefaultFocus(); + } + + ImGui.EndCombo(); + } + + return didChange; + } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs index d9dfe697..ef00d2f4 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs @@ -67,11 +67,21 @@ await this.uiModel.ElevenLabs.Say(elevenLabsVoicePreset.VoiceId, elevenLabsVoice public override void CancelAllSpeech() { this.uiModel.SoundQueue.CancelAllSounds(); + if (this.uiModel.ElevenLabs._TtsCts != null) + { + this.uiModel.ElevenLabs._TtsCts.Cancel(); + } + this.uiModel.SoundQueue.StopHardware(); } public override void CancelSay(TextSource source) { this.uiModel.SoundQueue.CancelFromSource(source); + if (this.uiModel.ElevenLabs._TtsCts != null) + { + this.uiModel.ElevenLabs._TtsCts.Cancel(); + } + this.uiModel.SoundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs index e2b002db..38368c32 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs @@ -21,7 +21,7 @@ public class ElevenLabsBackendUIModel : IDisposable /// /// Gets the sound playback queue. /// - public StreamSoundQueue SoundQueue { get; } + public StreamingSoundQueue SoundQueue { get; } /// /// Gets the currently-instantiated ElevenLabs client instance. @@ -46,7 +46,7 @@ public class ElevenLabsBackendUIModel : IDisposable public IReadOnlyDictionary Items, Dictionary? 
Rates)> Models { get; private set; } public ElevenLabsBackendUIModel(PluginConfiguration config, HttpClient http) { - SoundQueue = new StreamSoundQueue(config); + SoundQueue = new StreamingSoundQueue(config); ElevenLabs = new ElevenLabsClient(SoundQueue, http); this.config = config; this.getUserSubscriptionInfoImmediately = new ReactiveProperty(0); diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs index 1c27094c..e65b25a3 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs @@ -1,4 +1,6 @@ -using System; +using Newtonsoft.Json; +using Serilog; +using System; using System.Collections.Generic; using System.Collections.Immutable; using System.IO; @@ -6,9 +8,9 @@ using System.Net; using System.Net.Http; using System.Net.Http.Headers; +using System.Text; +using System.Threading; using System.Threading.Tasks; -using Newtonsoft.Json; -using Serilog; namespace TextToTalk.Backends.ElevenLabs; @@ -17,11 +19,13 @@ public class ElevenLabsClient private const string UrlBase = "https://api.elevenlabs.io"; private readonly HttpClient http; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; public string? ApiKey { get; set; } - public ElevenLabsClient(StreamSoundQueue soundQueue, HttpClient http) + public CancellationTokenSource? _TtsCts; + + public ElevenLabsClient(StreamingSoundQueue soundQueue, HttpClient http) { this.http = http; this.soundQueue = soundQueue; @@ -30,57 +34,75 @@ public ElevenLabsClient(StreamSoundQueue soundQueue, HttpClient http) public async Task Say(string? voice, int playbackRate, float volume, float similarityBoost, float stability, TextSource source, string text, string? model, string? 
style) { - if (!IsAuthorizationSet()) - { - throw new ElevenLabsMissingCredentialsException("No ElevenLabs authorization keys have been configured."); - } - Log.Information($"Style String = {style}"); - if (style != "") - { - model = "eleven_v3"; //force eleven_v3 model for styles - text = $"[{style}] " + text; //append style tag to text - } - float finalStability = stability; - if (model == "eleven_v3") // eleven_v3 only supports stability float values 0.0, 0.5, 1.0 - { - finalStability = (float)Math.Round(stability * 2.0f, MidpointRounding.AwayFromZero) / 2.0f; - } - Log.Information($"Message String = {text}"); - Log.Information($"Model String = {model}"); - var args = new ElevenLabsTextToSpeechRequest + Log.Information($"Style = {style}"); + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); + + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + + try { - Text = text, - ModelId = model, - VoiceSettings = new ElevenLabsVoiceSettings + if (!IsAuthorizationSet()) { - SimilarityBoost = similarityBoost, - Stability = finalStability, - }, - }; - - // Make the request - var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; - using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); - AddAuthorization(req); - req.Headers.Add("accept", "audio/mpeg"); + throw new ElevenLabsMissingCredentialsException("No ElevenLabs authorization keys have been configured."); + } - DetailedLog.Verbose(JsonConvert.SerializeObject(args)); - using var content = new StringContent(JsonConvert.SerializeObject(args)); - content.Headers.ContentType = new MediaTypeHeaderValue("application/json"); - req.Content = content; - - var res = await this.http.SendAsync(req); - EnsureSuccessStatusCode(res); + if (!string.IsNullOrEmpty(style)) + { + model = "eleven_v3"; + text = $"[{style}] " + text; + } - // Copy the sound to a new buffer and enqueue it - var responseStream = await res.Content.ReadAsStreamAsync(); - var mp3Stream = new MemoryStream(); - await responseStream.CopyToAsync(mp3Stream); - mp3Stream.Seek(0, SeekOrigin.Begin); + float finalStability = stability; + if (model == "eleven_v3") + { + finalStability = (float)Math.Round(stability * 2.0f, MidpointRounding.AwayFromZero) / 2.0f; + } - this.soundQueue.EnqueueSound(mp3Stream, source, StreamFormat.Mp3, volume); + var args = new ElevenLabsTextToSpeechRequest + { + Text = text, + ModelId = model, + VoiceSettings = new ElevenLabsVoiceSettings + { + SimilarityBoost = similarityBoost, + Stability = finalStability, + }, + }; + Log.Information($"Model Called = {args.ModelId}"); + Log.Information($"Message Sent = {args.Text}"); + + var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; + + // Use HttpCompletionOption.ResponseHeadersRead to begin processing before the body is fully downloaded + using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); + AddAuthorization(req); + req.Headers.Add("accept", "audio/mpeg"); + + using var content = new StringContent(JsonConvert.SerializeObject(args), Encoding.UTF8, "application/json"); + req.Content = content; + + // SendAsync with ResponseHeadersRead is the key for streaming + var res = await this.http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct); + EnsureSuccessStatusCode(res); + + // Get the stream directly from the response + var responseStream = await res.Content.ReadAsStreamAsync(ct); + + // Enqueue the live stream. + // IMPORTANT: Your soundQueue must be able to process the stream as bytes arrive. 
+ this.soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Mp3, res); + } + catch (OperationCanceledException) + { + // 2026 Best Practice: Catch the cancellation exception to prevent it + // from bubbling up as a generic error. + Log.Information("TTS generation was cancelled."); + } } + public async Task GetUserSubscriptionInfo() { if (!IsAuthorizationSet()) diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs index fae42c1c..898ad597 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs @@ -6,12 +6,12 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudBackend : VoiceBackend { private readonly GoogleCloudClient client; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly GoogleCloudBackendUI ui; public GoogleCloudBackend(PluginConfiguration config) { - soundQueue = new StreamSoundQueue(config); + soundQueue = new StreamingSoundQueue(config); client = new GoogleCloudClient(soundQueue, config.GoogleCreds); ui = new GoogleCloudBackendUI(config, client, this); } @@ -25,18 +25,28 @@ public override void Say(SayRequest request) if (request.Voice is not GoogleCloudVoicePreset voicePreset) throw new InvalidOperationException("Invalid voice preset provided."); - _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.SampleRate, voicePreset.Pitch, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, + _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, request.Text); } public override void CancelAllSpeech() { soundQueue.CancelAllSounds(); + if (client._TtsCts != null) + { + client._TtsCts?.Cancel(); + } + soundQueue.StopHardware(); } public override void CancelSay(TextSource source) { soundQueue.CancelFromSource(source); + if (client._TtsCts != null) + { + client._TtsCts?.Cancel(); + } + soundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs index f08b399d..38ed9a65 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs @@ -102,25 +102,8 @@ public void DrawVoicePresetOptions() ImGui.EndCombo(); } - var validSampleRates = new[] { "8000", "16000", "22050", "24000" }; - var sampleRate = currentVoicePreset.SampleRate.ToString(); - var sampleRateIndex = Array.IndexOf(validSampleRates, sampleRate); - if (ImGui.Combo($"Sample rate##{MemoizedId.Create()}", ref sampleRateIndex, validSampleRates, - validSampleRates.Length)) - { - currentVoicePreset.SampleRate = int.Parse(validSampleRates[sampleRateIndex]); - this.config.Save(); - } - - var pitch = currentVoicePreset.Pitch ?? 0; - if (ImGui.SliderFloat($"Pitch##{MemoizedId.Create()}", ref pitch, -10f, 10f, "%.2fx")) - { - currentVoicePreset.Pitch = pitch; - config.Save(); - } - var playbackRate = currentVoicePreset.PlaybackRate ?? 
1; - if (ImGui.SliderFloat($"Playback rate##{MemoizedId.Create()}", ref playbackRate, 0.25f, 4f, "%.2fx")) + if (ImGui.SliderFloat($"Playback rate##{MemoizedId.Create()}", ref playbackRate, 0.25f, 2f, "%.2fx")) { currentVoicePreset.PlaybackRate = playbackRate; config.Save(); diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs index e231d78d..d683bc68 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs @@ -1,9 +1,10 @@ +using Google.Cloud.TextToSpeech.V1; using System; using System.Collections.Generic; using System.IO; using System.Linq; +using System.Threading; using System.Threading.Tasks; -using Google.Cloud.TextToSpeech.V1; using WebSocketSharp; namespace TextToTalk.Backends.GoogleCloud; @@ -11,11 +12,13 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudClient { private TextToSpeechClient? client; - private readonly StreamSoundQueue? soundQueue; + private readonly StreamingSoundQueue? soundQueue; public Dictionary? Voices; public List? Locales; - public GoogleCloudClient(StreamSoundQueue soundQueue, string pathToCredential) + public CancellationTokenSource? _TtsCts; + + public GoogleCloudClient(StreamingSoundQueue soundQueue, string pathToCredential) { this.soundQueue = soundQueue; if (pathToCredential.IsNullOrEmpty()) return; @@ -33,17 +36,24 @@ public void Init(string pathToCredential) public Dictionary? GetGoogleTextToSpeechVoices() { if (client == null) return new Dictionary(); + + // Fetch all available voices var response = client.ListVoices(""); var fetchedVoices = new Dictionary(); foreach (var voice in response.Voices) { - fetchedVoices.Add(voice.Name, new + // Filter: Only include voices with "Chirp3" or "Chirp-HD" in their name + // Rebranded "Journey" voices also now fall under "Chirp-HD" + if (voice.Name.Contains("Chirp3") || voice.Name.Contains("Chirp-HD")) { - Name = voice.Name, - Gender = voice.SsmlGender, - }); + fetchedVoices.Add(voice.Name, new + { + Name = voice.Name, + Gender = voice.SsmlGender, + }); + } } return fetchedVoices; @@ -65,37 +75,76 @@ public List ExtractUniqueLocales(List? voicesList) return uniqueLocales.ToList().OrderBy(lang => lang).ToList(); } - public async Task Say(string? locale, string? voice, int? sampleRate, float? pitch, float? speed, float volume, TextSource source, - string text) + public async Task Say(string? locale, string? voice, float? speed, float volume, TextSource source, string text) { - if (client == null || soundQueue == null || locale == null) + if (client == null || soundQueue == null || locale == null) return; + + if (_TtsCts != null) { - return; + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); } - var request = new SynthesizeSpeechRequest + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + + try { - Input = new SynthesisInput { Text = text }, - Voice = new VoiceSelectionParams + // 1. Open the stream with the cancellation token + using var streamingCall = client.StreamingSynthesize(); + + // 2. FIRST request: Configuration ONLY + var configRequest = new StreamingSynthesizeRequest { - LanguageCode = locale, - Name = voice ?? "en-US-Wavenet-A" - }, - AudioConfig = new AudioConfig + StreamingConfig = new StreamingSynthesizeConfig + { + Voice = new VoiceSelectionParams + { + LanguageCode = locale, + Name = voice ?? 
"en-US-Chirp3-HD-Puff-A" + }, + StreamingAudioConfig = new StreamingAudioConfig + { + // Linear16 is the 2026 standard for Chirp 3 HD PCM streaming + AudioEncoding = AudioEncoding.Pcm, + SampleRateHertz = 24000, + SpeakingRate = speed ?? 1.0f, + } + } + }; + + // Pass token to WriteAsync to stop sending if cancelled + await streamingCall.WriteAsync(configRequest); + + // 3. SECOND request: Input Text ONLY + await streamingCall.WriteAsync(new StreamingSynthesizeRequest { - AudioEncoding = AudioEncoding.Mp3, - SampleRateHertz = sampleRate ?? 22050, - Pitch = pitch ?? 0, - SpeakingRate = speed ?? 1.0f, - VolumeGainDb = volume - } - }; + Input = new StreamingSynthesisInput { Text = text } + }); - var response = await client.SynthesizeSpeechAsync(request); + await streamingCall.WriteCompleteAsync(); - MemoryStream mp3Stream = new MemoryStream(response.AudioContent.ToByteArray()); - mp3Stream.Seek(0, SeekOrigin.Begin); + // 4. Process the response stream with the cancellation token + // Use WithCancellation to properly dispose of the enumerator on cancel + await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) + { + if (response.AudioContent.Length > 0) + { + var chunkStream = new MemoryStream(response.AudioContent.ToByteArray()); - soundQueue.EnqueueSound(mp3Stream, source, StreamFormat.Mp3, volume); + // Note: Linear16 audio is typically handled as StreamFormat.Pcm + // but matches Wave if your queue expects raw headerless bytes. + soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave, null); + } + } + } + catch (OperationCanceledException) + { + // Handle normal cancellation (e.g., stopping the voice) + } + catch (Grpc.Core.RpcException ex) when (ex.StatusCode == Grpc.Core.StatusCode.Cancelled) + { + // Handle gRPC specific cancellation + } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs index fe8d5977..221a3c54 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs @@ -4,14 +4,9 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudVoicePreset : VoicePreset { - public int? SampleRate { get; set; } - - // -20.0 - 20.0 is theoretical max, but it's lowered to work better with sliders (default 0.0) - public float? Pitch { get; set; } - public float Volume { get; set; } - // 0.25 - 4.0 (default 1.0) + // 0.25 - 2.0 (default 1.0) public float? PlaybackRate { get; set; } public string? 
Locale { get; set; } @@ -22,12 +17,10 @@ public class GoogleCloudVoicePreset : VoicePreset public override bool TrySetDefaultValues() { - SampleRate = 22050; - Pitch = 0.0f; Volume = 1.0f; PlaybackRate = 1.0f; Locale = "en-US"; - VoiceName = "en-US-Wavenet-D"; + VoiceName = "en-US-Chirp-HD-D"; Gender = "Male"; EnabledBackend = TTSBackend.GoogleCloud; return true; diff --git a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs index 65d262ff..be68f757 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs @@ -1,5 +1,9 @@ -using System.Diagnostics.CodeAnalysis; +using NAudio.CoreAudioApi; +using NAudio.Wave; +using System; +using System.Diagnostics.CodeAnalysis; using System.IO; +using System.Threading; using System.Threading.Tasks; using Dalamud.Game; using KokoroSharp; @@ -10,16 +14,19 @@ namespace TextToTalk.Backends.Kokoro; public class KokoroSoundQueue : SoundQueue { - private readonly KokoroPlayback playback = new(); - private readonly StreamSoundQueue streamSoundQueue; + private static readonly WaveFormat WaveFormat = new(24000, 16, 1); + private readonly object soundLock = new(); private readonly PluginConfiguration config; private readonly Task modelTask; + // WASAPI Hardware Members + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + public KokoroSoundQueue(PluginConfiguration config, Task modelTask) { this.config = config; this.modelTask = modelTask; - this.streamSoundQueue = new StreamSoundQueue(config); } private bool TryGetModel([NotNullWhen(true)] out KokoroModel? model) @@ -33,82 +40,131 @@ private bool TryGetModel([NotNullWhen(true)] out KokoroModel? model) return false; } - public void EnqueueSound(KokoroSourceQueueItem item) - { - this.AddQueueItem(item); - } - - protected override void OnSoundCancelled() - { - GetCurrentItem()?.Cancel(); - } - - public override void CancelAllSounds() - { - base.CancelAllSounds(); - streamSoundQueue.CancelAllSounds(); - } - - public override void CancelFromSource(TextSource source) - { - base.CancelFromSource(source); - streamSoundQueue.CancelFromSource(source); - } - protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) { - if (!TryGetModel(out var model) || nextItem.Aborted) + if (!TryGetModel(out var model) || nextItem.Aborted) return; + + // 1. Setup WASAPI Hardware Session + lock (this.soundLock) { - return; + if (this.soundOut == null) + { + var mmDevice = GetWasapiDeviceFromGuid(config.SelectedAudioDeviceGuid); + this.bufferedProvider = new BufferedWaveProvider(WaveFormat) + { + ReadFully = false, + BufferDuration = TimeSpan.FromSeconds(30), + DiscardOnBufferOverflow = true + }; + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } } - var lang = nextItem.Language; - - // https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md - string langCode = lang switch + // 2. Prepare Language & Tokens + string langCode = nextItem.Language switch { ClientLanguage.Japanese => "ja", - ClientLanguage.English => "en", ClientLanguage.German => "de", ClientLanguage.French => "fr", - _ => "en", + _ => config.KokoroUseAmericanEnglish ? "en-us" : "en", }; - if (langCode == "en" && config.KokoroUseAmericanEnglish) + int[] tokens = Tokenizer.Tokenize(nextItem.Text, langCode, preprocess: true); + var segments = SegmentationSystem.SplitToSegments(tokens, new() { MaxFirstSegmentLength = 200 }); + + // 3. 
Inference & Playback Loop + foreach (var chunk in segments) { - langCode = "en-us"; // Use American English for English language + if (nextItem.Aborted) break; + + // CPU Inference + var samples = model.Infer(chunk, nextItem.Voice.Features, nextItem.Speed); + byte[] bytes = KokoroPlayback.GetBytes(samples); + + // POST-INFERENCE ABORT CHECK: Prevent enqueuing "zombie" audio + if (nextItem.Aborted) break; + + lock (this.soundLock) + { + if (this.bufferedProvider != null && this.soundOut != null) + { + this.bufferedProvider.AddSamples(bytes, 0, bytes.Length); + if (this.soundOut.PlaybackState != PlaybackState.Playing) + { + this.soundOut.Play(); + } + } + } } - // this is a blocking call! - int[] tokens = Tokenizer.Tokenize(nextItem.Text, langCode, preprocess: true); - if (nextItem.Aborted) + // 4. Wait for audio to finish playing if not aborted + while (!nextItem.Aborted && this.bufferedProvider?.BufferedBytes > 0) { - return; + Thread.Sleep(50); } + } - var tokensList = SegmentationSystem.SplitToSegments(tokens, new() - { - MinFirstSegmentLength = 20, - MaxFirstSegmentLength = 200, - MaxSecondSegmentLength = 200 - }); // Split tokens into chunks Kokoro can handle + protected override void OnSoundCancelled() + { + // 1. Flag the current item to stop the inference loop + GetCurrentItem()?.Cancel(); + + // 2. Hard Stop the WASAPI hardware session immediately + StopHardware(); + } - foreach (var tokenChunk in tokensList) + private void StopHardware() + { + lock (this.soundLock) { - // this is a blocking call! - var samples = model.Infer(tokenChunk, nextItem.Voice.Features, nextItem.Speed); - if (nextItem.Aborted) + if (this.soundOut != null) + { + this.soundOut.Stop(); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) { - return; + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; } + } + } - var bytes = KokoroPlayback.GetBytes(samples); - var ms = new MemoryStream(bytes); - streamSoundQueue.EnqueueSound(ms, nextItem.Source, StreamFormat.Raw, nextItem.Volume); + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) + { + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) + { + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } + + protected override void Dispose(bool disposing) + { + if (disposing) StopHardware(); + base.Dispose(disposing); + } + + public void EnqueueSound(KokoroSourceQueueItem item) + { + // Add the item to the internal SoundQueue processing loop + this.AddQueueItem(item); } } + + + + public class KokoroSourceQueueItem : SoundQueueItem { public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float volume, TextSource source, ClientLanguage language) diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs index 8a80f2b0..1ea5a366 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs @@ -1,4 +1,5 @@ using Dalamud.Bindings.ImGui; +using OpenAI; using Serilog; using System; using System.Net; @@ -36,8 +37,8 @@ public override void Say(SayRequest request) { try { - 
Log.Information($"Voice name = {voicePreset.VoiceName}"); - await this.uiModel.OpenAi.Say(voicePreset, request, request.Text, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty)); + Log.Information($"Voice Style = {voicePreset.Style}"); + await this.uiModel.OpenAi.Say(request.Text, voicePreset.Model, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty), 1.0f, voicePreset.Volume); } catch (OpenAiUnauthorizedException e) { @@ -69,12 +70,42 @@ public override void Say(SayRequest request) public override void CancelAllSpeech() { + //Cancel at the queue this.uiModel.SoundQueue.CancelAllSounds(); + + //Cancel at Speech Generation + if (uiModel.OpenAi._ttsCts != null) + { + uiModel.OpenAi._ttsCts.Cancel(); + uiModel.OpenAi._ttsCts.Dispose(); + uiModel.OpenAi._ttsCts = null; + } + //Cancel at Playback + this.uiModel.SoundQueue.StopHardware(); + } public override void CancelSay(TextSource source) { + //Cancel at the queue this.uiModel.SoundQueue.CancelFromSource(source); + + //Cancel at Speech Generation + if (uiModel.OpenAi._ttsCts != null) + { + uiModel.OpenAi._ttsCts.Cancel(); + uiModel.OpenAi._ttsCts.Dispose(); + uiModel.OpenAi._ttsCts = null; + } + + //Cancel at Playback + if (uiModel.SoundQueue._ttsCts != null) + { + uiModel.OpenAi._ttsCts.Cancel(); + uiModel.OpenAi._ttsCts.Dispose(); + uiModel.OpenAi._ttsCts = null; + } + this.uiModel.SoundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) @@ -91,6 +122,9 @@ public override TextSource GetCurrentlySpokenTextSource() protected override void Dispose(bool disposing) { - if (disposing) this.uiModel.SoundQueue.Dispose(); + if (disposing) + { + this.uiModel.SoundQueue.Dispose(); + } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs index dd2b4136..6f41588c 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs @@ -2,6 +2,7 @@ using Dalamud.Game; using Dalamud.Game.Text; using System; +using System.Collections.Generic; using System.Linq; using TextToTalk.UI; using TextToTalk.UI.Windows; @@ -15,6 +16,7 @@ public class OpenAiBackendUI private readonly OpenAiBackend backend; private string apiKey; + private SortedSet selectedStyleIndices = new SortedSet(); public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, OpenAiBackend backend) { @@ -22,6 +24,7 @@ public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, O this.model = model; this.apiKey = this.model.GetApiKey(); this.backend = backend; + } public void DrawLoginOptions() @@ -45,6 +48,7 @@ public void DrawLoginOptions() } + public void DrawVoicePresetOptions() { var currentVoicePreset = model.GetCurrentVoicePreset(); @@ -66,6 +70,7 @@ public void DrawVoicePresetOptions() .ToArray(); if (ImGui.Combo($"Voice preset##{MemoizedId.Create()}", ref currentPresetIndex, presetDisplayNames, presets.Count)) config.SetCurrentVoicePreset(presets[currentPresetIndex].Id); + currentVoicePreset.SyncSetFromString(); } else if (currentVoicePreset != null) { @@ -184,17 +189,38 @@ public void DrawVoicePresetOptions() } else { - var style = currentVoicePreset.Style; - voiceStyles.Insert(0, ""); - var styleIndex = voiceStyles.IndexOf(currentVoicePreset.Style ?? 
""); - if (ImGui.Combo($"Voice Style##{MemoizedId.Create()}", ref styleIndex, voiceStyles, voiceStyles.Count)) + // 1. Generate the preview text directly from the set + string previewText = currentVoicePreset.Styles.Count > 0 + ? string.Join(", ", currentVoicePreset.Styles) + : "None selected"; + + // 2. Open the Combo + if (ImGui.BeginCombo($"Voice Style##{MemoizedId.Create()}", previewText)) { - currentVoicePreset.Style = voiceStyles[styleIndex]; - this.config.Save(); + foreach (var styleName in config.CustomVoiceStyles) + { + // Check if this style is currently in our preset's set + bool isSelected = currentVoicePreset.Styles.Contains(styleName); + + if (ImGui.Selectable(styleName, isSelected, ImGuiSelectableFlags.DontClosePopups)) + { + if (isSelected) + currentVoicePreset.Styles.Remove(styleName); + else + currentVoicePreset.Styles.Add(styleName); + + // 3. Save immediately + // Because 'Styles' is a reference type inside the preset, + // the save/reload won't "wipe" your local UI state anymore. + currentVoicePreset.SyncStringFromSet(); + this.config.Save(); + } + } + ImGui.EndCombo(); } } - Components.HelpTooltip(""" + Components.HelpTooltip(""" Styles are additional information that can be provided to the model to help it generate more accurate speech. This can include things like emphasis, pronunciation, pauses, tone, pacing, voice affect, inflections, word choice etc. Examples can be found at https://openai.fm diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs index baf707a8..1d8d0113 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs @@ -2,6 +2,9 @@ using System.Collections.Generic; using System.Net.Http; using System.Text.RegularExpressions; +using System.ClientModel; +using OpenAI; +using OpenAI.Models; // Ensure you have the Models namespace namespace TextToTalk.Backends.OpenAI; @@ -16,7 +19,9 @@ public class OpenAiBackendUIModel /// /// Gets the sound playback queue. /// - public StreamSoundQueue SoundQueue { get; } + public StreamingSoundQueue SoundQueue { get; } + + //public RawStreamingSoundQueue RawStreamingSoundQueue { get; } /// /// Gets the currently-instantiated OpenAI client instance. 
@@ -36,18 +41,20 @@ public class OpenAiBackendUIModel public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http) { - SoundQueue = new StreamSoundQueue(config); - OpenAi = new OpenAiClient(SoundQueue, http); + SoundQueue = new StreamingSoundQueue(config); + var credentials = OpenAiCredentialManager.LoadCredentials(); + if (credentials != null) + { + apiKey = (credentials.Password); + } + //RawStreamingSoundQueue = new RawStreamingSoundQueue(config); + OpenAi = new OpenAiClient(SoundQueue, apiKey); this.config = config; this.apiKey = ""; // this.Voices = new Dictionary>(); - var credentials = OpenAiCredentialManager.LoadCredentials(); - if (credentials != null) - { - LoginWith(credentials.Password); - } + } /// @@ -88,27 +95,45 @@ public void SetCurrentVoicePreset(int id) this.config.Save(); } - private bool TryLogin(string testApiKey) +private bool TryLogin(string testApiKey) +{ + OpenAiLoginException = null; + var lastApiKey = this.apiKey; + + try { - OpenAiLoginException = null; - var lastApiKey = this.apiKey; - try - { - DetailedLog.Info("Testing OpenAI authorization status"); - OpenAi.ApiKey = testApiKey; - // This should throw an exception if the API key was incorrect - OpenAi.TestCredentials().GetAwaiter().GetResult(); - DetailedLog.Info("OpenAI authorization successful"); - return true; - } - catch (Exception e) - { - OpenAiLoginException = e; - OpenAi.ApiKey = lastApiKey; - DetailedLog.Error(e, "Failed to initialize OpenAI client"); - return false; - } + DetailedLog.Info("Testing OpenAI authorization status..."); + + // 1. Initialize a temporary client with the test key + // In the v2 SDK, you can use OpenAIModelClient for a cheap validation call + var modelClient = new OpenAIModelClient(new ApiKeyCredential(testApiKey)); + + // 2. Perform a 'List Models' call. + // This is a free metadata call that requires valid authentication. + // Use GetModels() to verify credentials. + _ = modelClient.GetModels(); + + // 3. 
If successful, update the primary ApiKey and return true + this.apiKey = testApiKey; + DetailedLog.Info("OpenAI authorization successful."); + return true; + } + catch (ClientResultException e) + { + // Specifically catch SDK-based authentication or client errors + OpenAiLoginException = e; + this.apiKey = lastApiKey; + DetailedLog.Error(e, $"OpenAI authorization failed: {e.Status} {e.Message}"); + return false; + } + catch (Exception e) + { + OpenAiLoginException = e; + this.apiKey = lastApiKey; + DetailedLog.Error(e, "An unexpected error occurred during OpenAI initialization."); + return false; } +} public void Dispose() { diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs index 6c948689..cfbb357e 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs @@ -1,261 +1,123 @@ -using System; +using NAudio.CoreAudioApi; +using OpenAI; +using Serilog; +using System; +using System.ClientModel; +using System.ClientModel.Primitives; using System.Collections.Generic; using System.IO; using System.Linq; -using System.Net; using System.Net.Http; -using System.Text; using System.Text.Json; +using System.Threading; using System.Threading.Tasks; -using System.Text.RegularExpressions; -using TextToTalk.GameEnums; -using Serilog; +using OpenAIAudio = OpenAI.Audio; namespace TextToTalk.Backends.OpenAI; -public class OpenAiClient(StreamSoundQueue soundQueue, HttpClient http) +public class OpenAiClient { - private const string UrlBase = "https://api.openai.com"; + private readonly OpenAIClient _openAiClient; + private readonly StreamingSoundQueue _soundQueue; + public CancellationTokenSource? _ttsCts; + // --- Provided Definitions --- public record ModelConfig( - string ModelName, - IReadOnlyDictionary Voices, - bool InstructionsSupported, - bool SpeedSupported); + string ModelName, + IReadOnlyDictionary Voices, + bool InstructionsSupported, + bool SpeedSupported); private static readonly Dictionary VoiceLabels = new() -{ - { "alloy", "Alloy (Neutral & Balanced)" }, - { "ash", "Ash (Clear & Precise)" }, - { "ballad", "Ballad (Melodic & Smooth)" }, - { "coral", "Coral (Warm & Friendly)" }, - { "echo", "Echo (Resonant & Deep)" }, - { "fable", "Fable (Alto Narrative)" }, - { "onyx", "Onyx (Deep & Energetic)" }, - { "nova", "Nova (Bright & Energetic)" }, - { "sage", "Sage (Calm & Thoughtful)" }, - { "shimmer", "Shimmer (Bright & Feminine)" }, - { "verse", "Verse (Versatile & Expressive)" }, - { "marin", "Marin (Latest and Greatest)" }, - { "cedar", "Cedar (Latest and Greatest)" } -}; + { + { "alloy", "Alloy (Neutral & Balanced)" }, + { "ash", "Ash (Clear & Precise)" }, + { "ballad", "Ballad (Melodic & Smooth)" }, + { "coral", "Coral (Warm & Friendly)" }, + { "echo", "Echo (Resonant & Deep)" }, + { "fable", "Fable (Alto Narrative)" }, + { "onyx", "Onyx (Deep & Energetic)" }, + { "nova", "Nova (Bright & Energetic)" }, + { "sage", "Sage (Calm & Thoughtful)" }, + { "shimmer", "Shimmer (Bright & Feminine)" }, + { "verse", "Verse (Versatile & Expressive)" }, + { "marin", "Marin (Latest and Greatest)" }, + { "cedar", "Cedar (Latest and Greatest)" } + }; public static readonly List Models = [ - new("gpt-4o-mini-tts", - VoiceLabels.ToDictionary(v => v.Key, v => v.Value), - true, false), - - new("tts-1", - VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse") - .ToDictionary(v => v.Key, v => v.Value), - false, true), - - new("tts-1-hd", - VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse") - 
.ToDictionary(v => v.Key, v => v.Value), - false, false) + new("gpt-4o-mini-tts", VoiceLabels.ToDictionary(v => v.Key, v => v.Value), true, true), + new("tts-1", VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse").ToDictionary(v => v.Key, v => v.Value), false, true), + new("tts-1-hd", VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse").ToDictionary(v => v.Key, v => v.Value), false, true) ]; - // public record ModelConfig(string ModelName, IReadOnlySet Voices, bool InstructionsSupported, bool SpeedSupported); - // - // public static readonly List Models = - // [ - // // Note: while speed is 'technically' supported by gpt-4o-mini-tts, it doesn't appear to influence the output. - // new("gpt-4o-mini-tts", new HashSet - // { - // "alloy", - // "ash", - // "ballad", - // "coral", - // "echo", - // "fable", - // "onyx", - // "nova", - // "sage", - // "shimmer", - // "verse" - // }, true, false), - // new("tts-1", new HashSet - // { - // "nova", - // "shimmer", - // "echo", - // "onyx", - // "fable", - // "alloy", - // "ash", - // "sage", - // "coral" - // }, false, true), - // new("tts-1-hd", new HashSet - // { - // "nova", - // "shimmer", - // "echo", - // "onyx", - // "fable", - // "alloy", - // "ash", - // "sage", - // "coral" - // }, false, false), - // ]; - public string? ApiKey { get; set; } - private void AddAuthorization(HttpRequestMessage req) + // --- Implementation --- + public OpenAiClient(StreamingSoundQueue soundQueue, string apiKey) { - req.Headers.Add("Authorization", $"Bearer {ApiKey}"); - } - - private bool IsAuthorizationSet() - { - return ApiKey is { Length: > 0 }; - } + _soundQueue = soundQueue; + ApiKey = apiKey; - public async Task TestCredentials() - { - if (!IsAuthorizationSet()) + if (!string.IsNullOrWhiteSpace(apiKey)) { - throw new OpenAiMissingCredentialsException("No OpenAI authorization keys have been configured."); + _openAiClient = new OpenAIClient(apiKey); } - - var uriBuilder = new UriBuilder(UrlBase) { Path = "/v1/models" }; - using var req = new HttpRequestMessage(HttpMethod.Get, uriBuilder.Uri); - AddAuthorization(req); - - var res = await http.SendAsync(req); - await EnsureSuccessStatusCode(res); } - public string? GetInstructionsForRequest(SayRequest request, OpenAiVoicePreset preset) + public async Task Say(string text, string modelName, string voiceId, string? instructions, float speed, float volume) { - var instructionBuilder = new StringBuilder(); - instructionBuilder.AppendLine($"Tone: Final fantasy 14 character named {request.Speaker}"); - if (request.Race is {Length: > 0}) - { - instructionBuilder.AppendLine($"Race: {request.Race}"); - } + if (_openAiClient == null) return; - if (request.BodyType is not BodyType.Unknown) - { - instructionBuilder.AppendLine($"BodyType: {request.BodyType}"); - } + // Cancel any previous request before starting a new one + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; - if (preset.Style is {Length: > 0}) + try { - instructionBuilder.AppendLine($"Instructions: {(!string.IsNullOrEmpty(request.Style) ? request.Style : preset.Style)}"); // Style tags from Say Request take precedence over Style tags from voice preset. - } - - var instructions = instructionBuilder.ToString() - .Trim(); - - return instructions.Length > 0 ? 
instructions : null; - } - - public async Task Say(OpenAiVoicePreset preset, SayRequest request, string text, string style) - { - if (!IsAuthorizationSet()) - { - throw new OpenAiMissingCredentialsException("No OpenAI authorization keys have been configured."); - } - - var uriBuilder = new UriBuilder(UrlBase) { Path = "/v1/audio/speech" }; - using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); - AddAuthorization(req); + OpenAIAudio.AudioClient audioClient = _openAiClient.GetAudioClient(modelName); - string model; - string voice; - if (preset.Model != null && Models.Any(m => m.ModelName == preset.Model)) + var requestBody = new Dictionary { - model = preset.Model; - } - else - { - model = Models.First().ModelName; - } - - if (request.Style is {Length: > 0 }) - { - model = "gpt-4o-mini-tts"; // Force Say request to model that can handle Voice Styles if user has embedded a style tag into their message - } - - var modelConfig = Models.First(m => m.ModelName == model); - if (preset.VoiceName != null && modelConfig.Voices.Keys.Contains(preset.VoiceName)) - { - voice = preset.VoiceName; - } - else - { - voice = modelConfig.Voices.Keys.First(); - } - - Dictionary args = new() - { - ["model"] = model, - ["input"] = text, - ["voice"] = voice, - ["response_format"] = "mp3", - ["speed"] = modelConfig.SpeedSupported ? preset.PlaybackRate ?? 1.0f : 1.0f + { "model", modelName }, + { "input", text }, + { "voice", voiceId.ToLowerInvariant() }, + { "response_format", "mp3" }, + { "speed", speed } }; - if (modelConfig.InstructionsSupported) - { - string? configinstructions = GetInstructionsForRequest(request, preset); - //if (style != "") - //{ - // args["instructions"] = style; - //} - // Instructions from style take precedence over preset instructions. - if (configinstructions != null) + if (Models.First(m => m.ModelName == "gpt-4o-mini-tts").InstructionsSupported) { - args["instructions"] = configinstructions; + requestBody["instructions"] = instructions ?? 
""; } - } - var json = JsonSerializer.Serialize(args); - DetailedLog.Verbose(json); - using var content = new StringContent(json, Encoding.UTF8, "application/json"); - req.Content = content; - var res = await http.SendAsync(req); - await EnsureSuccessStatusCode(res); + BinaryContent content = BinaryContent.Create(BinaryData.FromObjectAsJson(requestBody)); + RequestOptions options = new(); + options.BufferResponse = false; - var mp3Stream = new MemoryStream(); - var responseStream = await res.Content.ReadAsStreamAsync(); - await responseStream.CopyToAsync(mp3Stream); - mp3Stream.Seek(0, SeekOrigin.Begin); + // PASS THE TOKEN HERE + options.CancellationToken = token; - soundQueue.EnqueueSound(mp3Stream, request.Source, StreamFormat.Mp3, preset.Volume); - } + // The request will throw OperationCanceledException if cancelled during the call + ClientResult result = await audioClient.GenerateSpeechAsync(content, options); - private static async Task EnsureSuccessStatusCode(HttpResponseMessage res) - { - if (res.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden) + Stream liveAudioStream = result.GetRawResponse().ContentStream; + + // Register a callback to close the stream if cancellation happens while reading + token.Register(() => liveAudioStream.Close()); + + Log.Information("Queuing Sound"); + _soundQueue.EnqueueSound(liveAudioStream, TextSource.None, volume, StreamFormat.Mp3, null); + } + catch (OperationCanceledException) { - throw new OpenAiUnauthorizedException(res.StatusCode, "Unauthorized request."); + Log.Information("OpenAI Speech generation was cancelled by the user."); } - - if (!res.IsSuccessStatusCode) + catch (Exception ex) { - try - { - var content = await res.Content.ReadAsStringAsync(); - DetailedLog.Debug(content); - - var error = JsonSerializer.Deserialize(content); - if (error?.Error != null) - { - throw new OpenAiFailedException(res.StatusCode, error.Error, - $"Request failed with status code {error.Error.Code}: {error.Error.Message}"); - } - } - catch (Exception e) when (e is not OpenAiFailedException) - { - DetailedLog.Error(e, "Failed to parse OpenAI error response."); - } - - throw new OpenAiFailedException(res.StatusCode, null, $"Request failed with status code {res.StatusCode}."); + Log.Error(ex, "OpenAI Streaming Speech generation failed."); } } -} \ No newline at end of file +} diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs b/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs index 8fe4b867..14898a30 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs @@ -1,4 +1,5 @@ -using System.Linq; +using System.Collections.Generic; +using System.Linq; using System.Text.Json.Serialization; namespace TextToTalk.Backends.OpenAI; @@ -13,8 +14,26 @@ public class OpenAiVoicePreset : VoicePreset public float? PlaybackRate { get; set; } [JsonPropertyName("OpenAIVoiceName")] public string? VoiceName { get; set; } - - public string? Style { get; set; } + + public string? 
Style { get; set; } = ""; + + [JsonIgnore] public SortedSet Styles { get; set; } = new SortedSet(); + + + public void SyncSetFromString() + { + Styles.Clear(); + if (string.IsNullOrWhiteSpace(Style)) return; + + foreach (var s in Style.Split(", ")) + Styles.Add(s); + } + + // Call this whenever the UI changes the Set to update the String + public void SyncStringFromSet() + { + Style = string.Join(", ", Styles); + } public override bool TrySetDefaultValues() { diff --git a/src/TextToTalk/Backends/Polly/PollyClient.cs b/src/TextToTalk/Backends/Polly/PollyClient.cs index 716a3a14..1202e8f5 100644 --- a/src/TextToTalk/Backends/Polly/PollyClient.cs +++ b/src/TextToTalk/Backends/Polly/PollyClient.cs @@ -2,9 +2,11 @@ using Amazon.Polly; using Amazon.Polly.Model; using Amazon.Runtime; +using Serilog; using System; using System.Collections.Generic; using System.IO; +using System.Threading; using System.Threading.Tasks; using TextToTalk.Lexicons; @@ -13,15 +15,17 @@ namespace TextToTalk.Backends.Polly public class PollyClient : IDisposable { private readonly AmazonPollyClient client; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly LexiconManager lexiconManager; private readonly PluginConfiguration config; + public CancellationTokenSource? _TtsCts; + public PollyClient(string accessKey, string secretKey, RegionEndpoint region, LexiconManager lexiconManager, PluginConfiguration config) { var credentials = new BasicAWSCredentials(accessKey, secretKey); this.client = new AmazonPollyClient(credentials, region); - this.soundQueue = new StreamSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config); this.lexiconManager = lexiconManager; } @@ -53,6 +57,12 @@ public TextSource GetCurrentlySpokenTextSource() public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, int sampleRate, int playbackRate, float volume, TextSource source, string text) { + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); + + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + if (!string.IsNullOrEmpty(amazonDomainName)) { text = $"{text}"; @@ -71,33 +81,48 @@ public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, in TextType = TextType.Ssml, }; - SynthesizeSpeechResponse res; try { - res = await this.client.SynthesizeSpeechAsync(req); + // Using 'using' ensures the response (and its stream) is disposed after the queue handles it + var res = await this.client.SynthesizeSpeechAsync(req, ct); + + // Pass the live AudioStream directly to the queue. + // Ensure EnqueueSound is updated to process the stream as it arrives. + this.soundQueue.EnqueueSound(res.AudioStream, source, volume, StreamFormat.Mp3, null); + } + catch (OperationCanceledException) + { + // 2026 Best Practice: Catch the cancellation exception to prevent it + // from bubbling up as a generic error. 
+ Log.Information("TTS generation was cancelled."); } catch (Exception e) { DetailedLog.Error(e, "Synthesis request failed in {0}.", nameof(PollyClient)); - return; } - var responseStream = new MemoryStream(); - await res.AudioStream.CopyToAsync(responseStream); - responseStream.Seek(0, SeekOrigin.Begin); - - this.soundQueue.EnqueueSound(responseStream, source, StreamFormat.Mp3, volume); } public Task CancelAllSounds() { this.soundQueue.CancelAllSounds(); + if (this._TtsCts != null) + { + this._TtsCts.Cancel(); + } + this.soundQueue.StopHardware(); return Task.CompletedTask; } public Task CancelFromSource(TextSource source) { this.soundQueue.CancelFromSource(source); + this.soundQueue.CancelAllSounds(); + if (this._TtsCts != null) + { + this._TtsCts.Cancel(); + } + this.soundQueue.StopHardware(); return Task.CompletedTask; } diff --git a/src/TextToTalk/Backends/SoundQueue.cs b/src/TextToTalk/Backends/SoundQueue.cs index 387ca2ec..3046f58a 100644 --- a/src/TextToTalk/Backends/SoundQueue.cs +++ b/src/TextToTalk/Backends/SoundQueue.cs @@ -1,4 +1,5 @@ -using System; +using Serilog; +using System; using System.Collections.Generic; using System.Linq; using System.Threading; diff --git a/src/TextToTalk/Backends/StreamFormat.cs b/src/TextToTalk/Backends/StreamFormat.cs index 2284855a..f6ff90a3 100644 --- a/src/TextToTalk/Backends/StreamFormat.cs +++ b/src/TextToTalk/Backends/StreamFormat.cs @@ -5,4 +5,7 @@ public enum StreamFormat Mp3, Wave, Raw, + Azure, + System, + Uberduck, } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamSoundQueue.cs b/src/TextToTalk/Backends/StreamSoundQueue.cs index e1894991..3755c96a 100644 --- a/src/TextToTalk/Backends/StreamSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamSoundQueue.cs @@ -1,5 +1,6 @@ using NAudio.Wave; using NAudio.Wave.SampleProviders; +using Serilog; using System; using System.IO; using System.Threading; @@ -32,6 +33,7 @@ protected override void OnSoundLoop(StreamSoundQueueItem nextItem) // Play the sound lock (this.soundLock) { + Log.Information("Playing"); this.soundOut = new DirectSoundOut(playbackDeviceId); this.soundOut.PlaybackStopped += (_, _) => { this.speechCompleted.Set(); }; this.soundOut.Init(volumeSampleProvider); diff --git a/src/TextToTalk/Backends/StreamSoundQueueItem.cs b/src/TextToTalk/Backends/StreamSoundQueueItem.cs index 3ae5826e..a9b2c96d 100644 --- a/src/TextToTalk/Backends/StreamSoundQueueItem.cs +++ b/src/TextToTalk/Backends/StreamSoundQueueItem.cs @@ -1,4 +1,6 @@ using System.IO; +using System.Net.Http; +using static TextToTalk.Backends.System.SystemSoundQueue; namespace TextToTalk.Backends { @@ -10,14 +12,44 @@ public class StreamSoundQueueItem : SoundQueueItem public StreamFormat Format { get; init; } + protected override void Dispose(bool disposing) { if (disposing) { - Data.Dispose(); + try + { + Data.Dispose(); + } + catch { } + + base.Dispose(disposing); } + } + + public class StreamingSoundQueueItem : SoundQueueItem + { + public Stream Data { get; init; } - base.Dispose(disposing); + public float Volume { get; init; } + + public StreamFormat Format { get; init; } + public HttpResponseMessage? 
Response { get; set; } + public bool Aborted { get; set; } + + protected override void Dispose(bool disposing) + { + if (disposing) + { + try + { + Data.Dispose(); + } + catch { } + + base.Dispose(disposing); + } + } } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs new file mode 100644 index 00000000..31996af1 --- /dev/null +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -0,0 +1,335 @@ +using NAudio.CoreAudioApi; +using NAudio.Wave; +using NAudio.Wave.SampleProviders; +using Serilog; +using System; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Net.Sockets; +using System.Speech.Synthesis; +using System.Threading; +using System.Threading.Tasks; +using TextToTalk.Backends.System; +using TextToTalk.Lexicons; +using static TextToTalk.Backends.StreamSoundQueueItem; + +namespace TextToTalk.Backends +{ + + public class StreamingSoundQueue(PluginConfiguration config) : SoundQueue + { + // WASAPI Hardware Members + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + private VolumeSampleProvider? volumeProvider; + private readonly object soundLock = new(); + + // 1. Unified Audio Configuration + private static readonly WaveFormat Uberduck = new(22050, 16, 1); + private static readonly WaveFormat Wave = new(24000, 16, 1); + private static readonly WaveFormat Mp3 = new(24000, 16, 1); + private static readonly WaveFormat Azure = new(16000, 16, 1); + + private bool _isDisposed; + public CancellationTokenSource? _ttsCts; + + public void EnqueueSound(Stream data, TextSource source, float volume, StreamFormat format, HttpResponseMessage? response) + { + AddQueueItem(new StreamingSoundQueueItem + { + Data = data, + Source = source, + Volume = volume, + Format = format, + Response = response, + }); + } + + protected override void OnSoundLoop(StreamingSoundQueueItem nextItem) + { + // 1. Handle Seekable vs Network Streams + if (nextItem.Data.CanSeek) + { + nextItem.Data.Position = 0; + } + + // 2. Branch logic based on format (Encoded vs Raw) + if (nextItem.Format == StreamFormat.Mp3 || nextItem.Format == StreamFormat.Uberduck) + { + ProcessMp3Stream(nextItem); + } + else + { + ProcessRawPcmStream(nextItem); + } + } + + private void ProcessMp3Stream(StreamingSoundQueueItem nextItem) + { + Log.Information("Playing as Mp3)"); + IMp3FrameDecompressor decompressor = null; + try + { + // Wrap the network stream to support forward-only Position tracking + // and prevent partial-read exceptions in LoadFromStream. + using var readFullyStream = new ReadFullyStream(nextItem.Data); + Log.Information("Processing as MP3"); + while (true) + { + + Mp3Frame frame; + try + { + frame = Mp3Frame.LoadFromStream(readFullyStream); + } + catch (Exception) // Catching interruptions here + { + break; + } + + if (frame == null) break; + + if (decompressor == null) + { + WaveFormat mp3Format = new Mp3WaveFormat(frame.SampleRate, + frame.ChannelMode == ChannelMode.Mono ? 
1 : 2, + frame.FrameLength, frame.BitRate); + + decompressor = new AcmMp3FrameDecompressor(mp3Format); + + lock (this.soundLock) + { + EnsureHardwareInitialized(decompressor.OutputFormat); + } + } + + byte[] decompressedBuffer = new byte[16384 * 2]; + int decompressedBytes = decompressor.DecompressFrame(frame, decompressedBuffer, 0); + ApplyVolumeToPcmBuffer(decompressedBuffer, decompressedBytes, nextItem.Volume); + + if (decompressedBytes > 0) + { + lock (this.soundLock) + { + + this.bufferedProvider.AddSamples(decompressedBuffer, 0, decompressedBytes); + + if (this.bufferedProvider.BufferedBytes > 4096 && + this.soundOut.PlaybackState != PlaybackState.Playing) + { + this.soundOut.Play(); + } + } + } + } + } + catch (Exception ex) + { + Log.Error(ex, "Error during real-time ACM decompression"); + } + finally + { + decompressor?.Dispose(); + nextItem.Data.Dispose(); + } + } + + private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) + { + Log.Information("Playing as raw PCM"); + // Resolve format for raw PCM types + WaveFormat chunkFormat = nextItem.Format switch + { + StreamFormat.Wave => Wave, + StreamFormat.Azure => Azure, + _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."), + }; + + lock (this.soundLock) + { + EnsureHardwareInitialized(chunkFormat); + + // Read in chunks to avoid calling .Length on network streams + byte[] chunkBuffer = new byte[16384]; + int bytesRead; + while ((bytesRead = nextItem.Data.Read(chunkBuffer, 0, chunkBuffer.Length)) > 0) + { + ApplyVolumeToPcmBuffer(chunkBuffer, bytesRead, nextItem.Volume); + + this.bufferedProvider.AddSamples(chunkBuffer, 0, bytesRead); + + if (this.bufferedProvider.BufferedBytes > 512 && this.soundOut.PlaybackState != PlaybackState.Playing) + { + this.soundOut.Play(); + } + } + } + nextItem.Data.Dispose(); + } + + private void EnsureHardwareInitialized(WaveFormat format) + { + if (this.soundOut == null || !this.bufferedProvider.WaveFormat.Equals(format)) + { + this.StopHardware(); + this.bufferedProvider = new BufferedWaveProvider(format) { ReadFully = true }; + this.bufferedProvider.BufferDuration = TimeSpan.FromSeconds(30); + + var mmDevice = GetWasapiDeviceFromGuid(config.SelectedAudioDeviceGuid); + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } + } + + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) + { + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) + { + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } + } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } + protected override void OnSoundCancelled() + { + StopHardware(); + } + public override void CancelAllSounds() + { + StopHardware(); + base.CancelAllSounds(); + } + + public void StopHardware() + { + lock (this.soundLock) + { + if (this.soundOut != null) + { + this.soundOut.Stop(); + Thread.Sleep(10); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) + { + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; + } + } + + } + protected override void Dispose(bool disposing) + { + if (_isDisposed) return; + 
_isDisposed = true; // Signal all loops to stop immediately + + if (disposing) + { + try + { + soundOut?.Stop(); + } + catch (Exception ex) + { + DetailedLog.Error(ex, "Error during early shutdown phase"); + } + + base.Dispose(disposing); // Clean up the queue thread + soundOut?.Dispose(); + } + } + private void ApplyVolumeToPcmBuffer(byte[] buffer, int bytesRead, float volume) + { + // Skip calculation if volume is 100% to save CPU + if (Math.Abs(volume - 1.0f) < 0.001f) return; + + for (int i = 0; i < bytesRead; i += 2) + { + // 1. Combine two bytes into one 16-bit signed integer (short) + short sample = (short)((buffer[i + 1] << 8) | buffer[i]); + + // 2. Scale the sample value by the volume float (0.0 to 1.0) + float scaledSample = sample * volume; + + // 3. Clamp the value to ensure it stays within 16-bit bounds to prevent "pops" + if (scaledSample > short.MaxValue) scaledSample = short.MaxValue; + if (scaledSample < short.MinValue) scaledSample = short.MinValue; + + short finalSample = (short)scaledSample; + + // 4. Split the 16-bit sample back into two bytes and store in the buffer + buffer[i] = (byte)(finalSample & 0xFF); + buffer[i + 1] = (byte)((finalSample >> 8) & 0xFF); + } + } + } + + public class ReadFullyStream : Stream + { + private readonly Stream sourceStream; + private long pos; + + public ReadFullyStream(Stream sourceStream) + { + this.sourceStream = sourceStream; + } + + public override int Read(byte[] buffer, int offset, int count) + { + int totalBytesRead = 0; + try + { + while (totalBytesRead < count) + { + int bytesRead = sourceStream.Read(buffer, offset + totalBytesRead, count - totalBytesRead); + if (bytesRead == 0) break; + totalBytesRead += bytesRead; + } + } + catch (Exception ex) when (ex.InnerException is SocketException se && se.NativeErrorCode == 10053) + { + // "An established connection was aborted by the software in your host machine" + // This happens when we cancel the TTS request. Return 0 to signal End of Stream. + return 0; + } + catch (IOException) + { + // General network interruption during skip + return 0; + } + + pos += totalBytesRead; + return totalBytesRead; + } + + // Required for the class to compile + public override void Write(byte[] buffer, int offset, int count) + => throw new NotSupportedException(); + + public override bool CanRead => true; + public override bool CanSeek => false; + public override bool CanWrite => false; + public override long Length => throw new NotSupportedException(); + + // We provide a getter for Position so Mp3Frame can track its progress, + // but the setter is not supported. 
+ public override long Position + { + get => pos; + set => throw new NotSupportedException(); + } + + public override void Flush() { } + public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); + public override void SetLength(long value) => throw new NotSupportedException(); + } + +} diff --git a/src/TextToTalk/Backends/System/SystemBackend.cs b/src/TextToTalk/Backends/System/SystemBackend.cs index 41333582..d37accce 100644 --- a/src/TextToTalk/Backends/System/SystemBackend.cs +++ b/src/TextToTalk/Backends/System/SystemBackend.cs @@ -59,6 +59,7 @@ protected override void Dispose(bool disposing) { this.voiceExceptions.Dispose(); this.soundQueue.Dispose(); + } } } diff --git a/src/TextToTalk/Backends/System/SystemSoundQueue.cs b/src/TextToTalk/Backends/System/SystemSoundQueue.cs index f81bd83d..3650b741 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueue.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueue.cs @@ -1,5 +1,10 @@ -using R3; +using NAudio.CoreAudioApi; +using NAudio.Wave; +using NAudio.Wave.SampleProviders; +using R3; +using Serilog; using System; +using System.Collections.Generic; using System.IO; using System.Speech.Synthesis; using System.Threading; @@ -10,125 +15,253 @@ namespace TextToTalk.Backends.System { public class SystemSoundQueue : SoundQueue { - private MemoryStream stream; - private readonly SpeechSynthesizer speechSynthesizer; - private readonly LexiconManager lexiconManager; - private readonly StreamSoundQueue streamSoundQueue; - private readonly SystemBackend backend; - private readonly PluginConfiguration config; - private int soundLock; - private readonly SemaphoreSlim deviceLock = new SemaphoreSlim(1, 1); + // WASAPI Hardware Members + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + private VolumeSampleProvider? volumeProvider; + private readonly object soundLock = new(); - public Observable SelectVoiceFailed => selectVoiceFailed; - private readonly Subject selectVoiceFailed; - private bool isSynthesizing = false; + // 1. 
Unified Audio Configuration + private static readonly WaveFormat SystemFormat = new(22050, 16, 1); + private readonly SpeechSynthesizer _speechSynthesizer; + private readonly LexiconManager _lexiconManager; + private readonly PluginConfiguration _config; - public async void ASyncSpeak(SpeechSynthesizer synth, string textToSpeak) - { - await Task.Run(() => synth.SpeakSsml(textToSpeak)); - } + private readonly Subject _selectVoiceFailed = new(); + public Observable SelectVoiceFailed => _selectVoiceFailed; + private bool _isDisposed; - public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config) + private readonly Dictionary _synthPool = new(); + + private SpeechSynthesizer GetSynthesizerForVoice(string voiceName) { - this.streamSoundQueue = new StreamSoundQueue(config); - this.lexiconManager = lexiconManager; - this.speechSynthesizer = new SpeechSynthesizer(); - this.selectVoiceFailed = new Subject(); + if (!_synthPool.TryGetValue(voiceName, out var synth)) + { + synth = new SpeechSynthesizer(); + synth.SelectVoice(voiceName); + // Pre-link the bridge for this specific synth + _synthPool[voiceName] = synth; + } + return synth; } - public void EnqueueSound(VoicePreset preset, TextSource source, string text) { AddQueueItem(new SystemSoundQueueItem { Preset = preset, - Text = text, Source = source, + Text = text, }); } - protected override async void OnSoundLoop(SystemSoundQueueItem nextItem) + public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config) + { + _lexiconManager = lexiconManager; + _config = config; + _speechSynthesizer = new SpeechSynthesizer(); + + } + + + + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) { - if (nextItem.Preset is not SystemVoicePreset systemVoicePreset) + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) { - throw new InvalidOperationException("Invalid voice preset provided."); + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } - try + protected override void OnSoundLoop(SystemSoundQueueItem nextItem) + { + if (nextItem.Preset is not SystemVoicePreset preset || nextItem.Aborted) return; + + // 1. Mimic Kokoro: Shared Hardware Setup + lock (this.soundLock) { - this.speechSynthesizer.UseVoicePreset(nextItem.Preset); + if (this.soundOut == null) + { + var mmDevice = GetWasapiDeviceFromGuid(_config.SelectedAudioDeviceGuid); + // Match the voice's expected format (SAPI default is 22050Hz, 16-bit, Mono) + this.bufferedProvider = new BufferedWaveProvider(new WaveFormat(22050, 16, 1)) + { + ReadFully = true, // Prevents WASAPI from stopping on empty buffer + BufferDuration = TimeSpan.FromSeconds(30) + }; + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } } - catch (SelectVoiceFailedException e) + + // 1.5 Instant Voice Switching via Pool + if (!_synthPool.TryGetValue(preset.VoiceName, out var synth)) { - DetailedLog.Error(e, "Failed to select voice {0}", systemVoicePreset.VoiceName ?? 
""); - this.selectVoiceFailed.OnNext(e); + synth = new SpeechSynthesizer(); + synth.SelectVoice(preset.VoiceName); + _synthPool[preset.VoiceName] = synth; } - var ssml = this.lexiconManager.MakeSsml(nextItem.Text, - langCode: this.speechSynthesizer.Voice.Culture.IetfLanguageTag); - DetailedLog.Verbose(ssml); + synth.Volume = preset.Volume; + synth.Rate = preset.Rate; - try + // 2. Prepare Synthesis + if (_speechSynthesizer.Voice.Name != preset.VoiceName) { - isSynthesizing = true; + _speechSynthesizer.SelectVoice(preset.VoiceName); + } + _speechSynthesizer.Volume = preset.Volume; + _speechSynthesizer.Rate = preset.Rate; - await deviceLock.WaitAsync(); + var ssml = _lexiconManager.MakeSsml(nextItem.Text, langCode: _speechSynthesizer.Voice.Culture.IetfLanguageTag); - this.stream = new MemoryStream(); - this.speechSynthesizer.SetOutputToWaveStream(this.stream); + // 3. Start Synthesis in Background (Feeding the buffer via bridge) + using var bridge = new SynthesisBridgeStream(this.bufferedProvider!); + _speechSynthesizer.SetOutputToWaveStream(bridge); - await Task.Run(() => this.speechSynthesizer.SpeakSsml(ssml)); + // Use SpeakAsync to avoid blocking the loop + var synthPrompt = _speechSynthesizer.SpeakSsmlAsync(ssml); - } - catch (OperationCanceledException) + // 4. This loop remains active as long as synthesis is running or audio is playing + while (!nextItem.Aborted && (!synthPrompt.IsCompleted || this.bufferedProvider?.BufferedBytes > 44)) { - + if (this.bufferedProvider?.BufferedBytes > 512) // Pre-roll threshold + { + if (this.soundOut?.PlaybackState != PlaybackState.Playing) + { + this.soundOut?.Play(); + Log.Information("Playing"); + } + } } - finally - { - isSynthesizing = false; - deviceLock.Release(); - } + // 5. Cleanup current item - this.stream.Seek(0, SeekOrigin.Begin); - this.streamSoundQueue.EnqueueSound(stream, nextItem.Source, StreamFormat.Wave, 1f); + this.StopHardware(); } - public override void CancelAllSounds() + // Custom Stream to pipe synthesizer output directly to NAudio buffer + private class SynthesisBridgeStream : Stream { - base.CancelAllSounds(); - this.streamSoundQueue.CancelAllSounds(); + private readonly BufferedWaveProvider _target; + private int _bytesToSkip = 0; + private bool _headerSkipped = false; + private long _position = 0; + + public SynthesisBridgeStream(BufferedWaveProvider target) => _target = target; + + public override void Write(byte[] buffer, int offset, int count) + { + if (_bytesToSkip > 0) + { + int skipNow = Math.Min(count, _bytesToSkip); + _bytesToSkip -= skipNow; + offset += skipNow; + count -= skipNow; + } + + if (count > 0) + { + _target.AddSamples(buffer, offset, count); + } + } + + public override bool CanRead => false; + public override bool CanSeek => true; + public override bool CanWrite => true; + public override long Length => _position; + public override long Position { get => _position; set => _position = value; } + + public override long Seek(long offset, SeekOrigin origin) => _position; // Dummy seek + public override void SetLength(long value) { } + public override void Flush() { } + public override int Read(byte[] buffer, int offset, int count) => 0; } - public override void CancelFromSource(TextSource source) + protected override void OnSoundCancelled() { - base.CancelFromSource(source); - this.streamSoundQueue.CancelFromSource(source); - } + // 1. 
Flag the current item to stop the inference loop + GetCurrentItem()?.Cancel(); + _speechSynthesizer.SpeakAsyncCancelAll(); - protected override void OnSoundCancelled() + // 2. Hard Stop the WASAPI hardware session immediately + StopHardware(); + } + + public override void CancelAllSounds() { - try + // Check if disposed before accessing the synthesizer + if (_isDisposed) return; + + try { - this.speechSynthesizer.SetOutputToNull(); + _speechSynthesizer?.SpeakAsyncCancelAll(); } + catch (ObjectDisposedException) { /* Already gone, safe to ignore */ } - catch (ObjectDisposedException) - { + StopHardware(); + // Call base after local cancellation logic + base.CancelAllSounds(); + } + + private void StopHardware() + { + lock (this.soundLock) + { + if (this.soundOut != null) + { + this.soundOut.Stop(); + Thread.Sleep(10); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) + { + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; + } } } protected override void Dispose(bool disposing) { + if (_isDisposed) return; + _isDisposed = true; // Signal all loops to stop immediately + if (disposing) { - this.speechSynthesizer.Dispose(); - } + try + { + // Stop hardware first to release the audio device + soundOut?.Stop(); + + // Abort the synthesizer BEFORE calling base.Dispose + _speechSynthesizer?.SpeakAsyncCancelAll(); + _speechSynthesizer?.SetOutputToNull(); - base.Dispose(disposing); + // Give the background thread a very short window to exit gracefully + // rather than joining it indefinitely. + } + catch (Exception ex) + { + DetailedLog.Error(ex, "Error during early shutdown phase"); + } + + base.Dispose(disposing); // Clean up the queue thread + + _speechSynthesizer?.Dispose(); + soundOut?.Dispose(); + } } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs index ea52962d..a95c8b19 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs @@ -5,5 +5,12 @@ public class SystemSoundQueueItem : SoundQueueItem public VoicePreset Preset { get; set; } public string Text { get; set; } + + public bool Aborted { get; private set; } + + internal void Cancel() + { + Aborted = true; + } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs index 6cdda83f..dc306f4c 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs @@ -11,7 +11,7 @@ namespace TextToTalk.Backends.Uberduck; /// public class UberduckBackend : VoiceBackend { - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly UberduckBackendUI ui; private readonly UberduckClient? 
uberduck;
@@ -19,10 +19,10 @@ public UberduckBackend(PluginConfiguration config, HttpClient http)
     {
         TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFFDE7312);
 
-        this.soundQueue = new StreamSoundQueue(config);
+        this.soundQueue = new StreamingSoundQueue(config);
         this.uberduck = new UberduckClient(this.soundQueue, http);
 
-        var voices = this.uberduck.GetVoices().GetAwaiter().GetResult();
+        var voices = this.uberduck.UpdateVoices().GetAwaiter().GetResult();
         this.ui = new UberduckBackendUI(config, this.uberduck, () => voices, this);
     }
 
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
index 0bb5ab35..45805f84 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
@@ -29,8 +29,7 @@ public UberduckBackendUI(PluginConfiguration config, UberduckClient uberduck,
         var credentials = UberduckCredentialManager.LoadCredentials();
         if (credentials != null)
         {
-            this.apiKey = credentials.UserName;
-            this.apiSecret = credentials.Password;
+            this.apiKey = credentials.Password;
         }
 
         this.uberduck.ApiKey = this.apiKey;
@@ -52,11 +51,11 @@ public void DrawSettings(IConfigUIDelegates helpers)
 
         if (ImGui.Button($"Save and Login##{MemoizedId.Create()}"))
         {
-            var username = Whitespace.Replace(this.apiKey, "");
-            var password = Whitespace.Replace(this.apiSecret, "");
-            UberduckCredentialManager.SaveCredentials(username, password);
-            this.uberduck.ApiKey = username;
-            this.uberduck.ApiSecret = password;
+            var apiKey = Whitespace.Replace(this.apiKey, "");
+            //var password = Whitespace.Replace(this.apiSecret, "");
+            UberduckCredentialManager.SaveCredentials(apiKey);
+            this.uberduck.ApiKey = apiKey;
+            //this.uberduck.ApiSecret = password;
         }
 
         ImGui.SameLine();
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
index 23d2fb04..33f52f61 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
@@ -1,109 +1,90 @@
-using Newtonsoft.Json;
+using Dalamud.Interface.Windowing;
+using Newtonsoft.Json;
+using Serilog;
 using System;
 using System.Collections.Generic;
 using System.Collections.Immutable;
 using System.IO;
 using System.Linq;
-using System.Net;
 using System.Net.Http;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
+using WindowsSystem = System.Net;
 
 namespace TextToTalk.Backends.Uberduck;
 
 public partial class UberduckClient
 {
-    private const string UrlBase = "https://api.uberduck.ai";
+    private const string UrlBase = "https://api.uberduck.ai/v1";
 
     private readonly HttpClient http;
-    private readonly StreamSoundQueue soundQueue;
+    private readonly StreamingSoundQueue soundQueue;
 
     public string? ApiKey { private get; set; }
     public string? ApiSecret { private get; set; }
+    public IDictionary<string, IList<UberduckVoice>> CachedVoices { get; private set; }
+        = new Dictionary<string, IList<UberduckVoice>>();
+
     private IList<UberduckVoice> Voices { get; set; }
 
-    public UberduckClient(StreamSoundQueue soundQueue, HttpClient http)
+
+    public UberduckClient(StreamingSoundQueue soundQueue, HttpClient http)
     {
         this.http = http;
         this.soundQueue = soundQueue;
+        var credentials = UberduckCredentialManager.LoadCredentials();
+        if (credentials != null)
+        {
+            this.ApiKey = credentials.Password;
+        }
         Voices = new List<UberduckVoice>();
     }
 
-    public async Task<IDictionary<string, IList<UberduckVoice>>> GetVoices()
-    {
-        if (Voices.Count == 0) await UpdateVoices();
-        return Voices
-            .GroupBy(v => string.IsNullOrWhiteSpace(v.Category) ?
"Uncategorized" : v.Category) - .ToImmutableSortedDictionary( - g => g.Key, - g => (IList)g.OrderByDescending(v => v.DisplayName).ToList()); - } - public async Task Say(string voice, int playbackRate, float volume, TextSource source, string text) { - if (!IsAuthorizationSet()) - { - throw new UberduckMissingCredentialsException("No Uberduck authorization keys have been configured."); - } + var url = "https://api.uberduck.ai/v1/text-to-speech"; - ArgumentException.ThrowIfNullOrEmpty(voice); - - var voiceModelUuid = IsUuid(voice) ? voice : await GetUuidForVoice(voice); - var args = new UberduckSpeechRequest + var payload = new { - Speech = text, - VoiceModelUuid = voiceModelUuid, + text = text, + voice = voice, + output_format = "wav" // CRITICAL: Forces the API to return a WAV file instead of MP3 }; - // Make the request - using var content = new StringContent(JsonConvert.SerializeObject(args), Encoding.UTF8, "application/json"); - var res = await SendRequest("/speak", reqContent: content); - var uuid = res?.Uuid; - if (uuid is null) - { - DetailedLog.Warn("Got null UUID from Uberduck"); - return; - } - - DetailedLog.Debug($"Got request UUID {uuid} from Uberduck"); + using var request = new HttpRequestMessage(HttpMethod.Post, url); + request.Headers.Authorization = new WindowsSystem.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey); + request.Content = new StringContent(JsonConvert.SerializeObject(payload), Encoding.UTF8, "application/json"); - // Poll for the TTS result - await Task.Delay(20); + var response = await this.http.SendAsync(request); - var path = ""; - do + if (response.IsSuccessStatusCode) { - try - { - var status = await GetSpeechStatus(uuid); - if (status is not { FailedAt: null }) - { - DetailedLog.Warn($"TTS request {uuid} failed for an unknown reason"); - return; - } - - path = status.Path; - } - catch (UberduckFailedException e) when (e.StatusCode is HttpStatusCode.NotFound) - { - // ignored - } + var json = await response.Content.ReadAsStringAsync(); + var result = JsonConvert.DeserializeObject(json); - await Task.Delay(100); - } while (string.IsNullOrEmpty(path)); + if (result?.AudioUrl != null) + { + // Download the actual audio data + var audioBytes = await this.http.GetByteArrayAsync(result.AudioUrl); - DetailedLog.Debug($"Got response for TTS request {uuid}"); + // Use a MemoryStream to hold the downloaded data + var waveStream = new MemoryStream(audioBytes); - // Copy the sound to a new buffer and enqueue it - var responseStream = await this.http.GetStreamAsync(new Uri(path)); - var waveStream = new MemoryStream(); - await responseStream.CopyToAsync(waveStream); - waveStream.Seek(0, SeekOrigin.Begin); + // Pass the stream to your queue. Ensure the consumer uses WaveFileReader + // to correctly handle the WAV container. + this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Uberduck, null); + } + } + } - this.soundQueue.EnqueueSound(waveStream, source, StreamFormat.Wave, volume); + // 2026 Response Model + public class UberduckTtsResponse + { + [JsonProperty("audio_url")] + public string? AudioUrl { get; set; } } private Task GetSpeechStatus(string uuid) @@ -122,14 +103,47 @@ private static bool IsUuid(string voice) private async Task GetUuidForVoice(string voice) { if (Voices.Count == 0) await UpdateVoices(); - var voiceInfo = Voices.Single(v => v.Name == voice); + var voiceInfo = Voices.Single(v => v.Name == voice); return voiceInfo.VoiceModelUuid; } - private async Task UpdateVoices() + // 1. 
Change return type to Task<IDictionary<string, IList<UberduckVoice>>>
+    public async Task<IDictionary<string, IList<UberduckVoice>>> UpdateVoices()
     {
-        var voicesRes = await this.http.GetStringAsync(new Uri("https://api.uberduck.ai/voices?mode=tts-basic"));
-        Voices = JsonConvert.DeserializeObject<List<UberduckVoice>>(voicesRes) ?? new List<UberduckVoice>();
+        Log.Information("Updating Voices...");
+        if (IsAuthorizationSet())
+        {
+            var request = new HttpRequestMessage(HttpMethod.Get, "https://api.uberduck.ai/v1/voices");
+            AddAuthorization(request);
+
+            var response = await this.http.SendAsync(request);
+
+            if (response.IsSuccessStatusCode)
+            {
+                var json = await response.Content.ReadAsStringAsync();
+                var result = JsonConvert.DeserializeObject<UberduckVoiceResponse>(json);
+
+                // Update local state if needed
+                this.Voices = result?.Voices ?? new List<UberduckVoice>();
+
+                // Update the cached variable before returning
+                this.CachedVoices = this.Voices
+                    .OrderBy(v => v.DisplayName)
+                    .GroupBy(v => v.Category ?? "Uncategorized")
+                    .ToDictionary(
+                        g => g.Key,
+                        g => (IList<UberduckVoice>)g.ToList()
+                    );
+                return this.CachedVoices;
+            }
+            else
+            {
+                Log.Information($"Response = {response.StatusCode}");
+            }
+        }
+
+        // Return empty dictionary if authorization fails or request fails
+        return new Dictionary<string, IList<UberduckVoice>>();
     }
 
     private async Task SendRequest(string endpoint, string query = "",
@@ -151,7 +165,7 @@ private async Task UpdateVoices()
         var res = await this.http.SendAsync(req);
         var resContent = await res.Content.ReadAsStringAsync();
 
-        if (res.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden)
+        if (res.StatusCode is WindowsSystem.HttpStatusCode.Unauthorized or WindowsSystem.HttpStatusCode.Forbidden)
         {
             var detail = GetRequestFailureDetail(resContent);
             throw new UberduckUnauthorizedException(detail);
@@ -183,15 +197,16 @@ private static string GetRequestFailureDetail(string resContent)
 
     private void AddAuthorization(HttpRequestMessage req)
     {
-        // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/WWW-Authenticate#basic_authentication
-        var raw = Encoding.UTF8.GetBytes($"{ApiKey}:{ApiSecret}");
-        var encodedAuth = Convert.ToBase64String(raw);
-        req.Headers.Add("Authorization", $"Basic {encodedAuth}");
+        // 2026 standard uses Bearer token with the API Key
+        // Ensure your ApiKey is the "Public Key" or "API Key" from the Uberduck dashboard
+        req.Headers.Authorization = new global::System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey);
     }
 
     private bool IsAuthorizationSet()
     {
-        return ApiKey?.Length > 0 && ApiSecret?.Length > 0;
+        var resultbool = ApiKey?.Length > 0;
+        Log.Information($"Is Authorization Set? {resultbool}");
+        return resultbool;// && ApiSecret?.Length > 0;
     }
 
     private class UberduckSpeechRequest
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs b/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs
index 23fb8e98..a274ecb1 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs
@@ -13,9 +13,9 @@ public static class UberduckCredentialManager
         return credentials;
     }
 
-    public static void SaveCredentials(string username, string password)
+    public static void SaveCredentials(string apikey)//, string password)
     {
-        var credentials = new NetworkCredential(username, password);
+        var credentials = new NetworkCredential("null", apikey);//, password);
         CredentialManager.SaveCredentials(CredentialsTarget, credentials);
     }
 
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs b/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs
index 7ca7824f..b196b7aa 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs
@@ -1,14 +1,33 @@
 using Newtonsoft.Json;
+using System.Collections.Generic;
 
 namespace TextToTalk.Backends.Uberduck;
 
 public class UberduckVoice
 {
-    [JsonProperty("display_name")] public required string DisplayName { get; init; }
+    [JsonProperty("display_name")]
+    public required string DisplayName { get; init; }
 
-    [JsonProperty("name")] public required string Name { get; init; }
+    [JsonProperty("name")]
+    public required string Name { get; init; }
 
-    [JsonProperty("voicemodel_uuid")] public required string VoiceModelUuid { get; init; }
+    [JsonProperty("voicemodel_uuid")]
+    public required string VoiceModelUuid { get; init; }
 
-    [JsonProperty("category")] public required string? Category { get; init; }
+    [JsonProperty("category")]
+    public string? Category { get; init; }
+
+    // New for 2026: Useful for filtering by locale
+    [JsonProperty("language")]
+    public string? Language { get; init; }
+
+    // New for 2026: Contains traits like "professional", "narrative", etc.
+ [JsonProperty("tags")] + public List Tags { get; init; } = new(); +} + +public class UberduckVoiceResponse +{ + [JsonProperty("voices")] + public IList Voices { get; set; } } \ No newline at end of file diff --git a/src/TextToTalk/PluginConfiguration.cs b/src/TextToTalk/PluginConfiguration.cs index 7f1088ef..0aa55a16 100644 --- a/src/TextToTalk/PluginConfiguration.cs +++ b/src/TextToTalk/PluginConfiguration.cs @@ -8,6 +8,7 @@ using System.IO; using System.Linq; using System.Net; +using System.Threading; using TextToTalk.Backends; using TextToTalk.Backends.System; using TextToTalk.Backends.Websocket; diff --git a/src/TextToTalk/TextToTalk.csproj b/src/TextToTalk/TextToTalk.csproj index 6258ee03..f0d7b4ed 100644 --- a/src/TextToTalk/TextToTalk.csproj +++ b/src/TextToTalk/TextToTalk.csproj @@ -22,6 +22,11 @@ true + + + + + Always @@ -31,11 +36,12 @@ - + + compile; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/TextToTalk/VoicePresetConfiguration.cs b/src/TextToTalk/VoicePresetConfiguration.cs index f116042e..cea5a588 100644 --- a/src/TextToTalk/VoicePresetConfiguration.cs +++ b/src/TextToTalk/VoicePresetConfiguration.cs @@ -254,8 +254,6 @@ private static VoicePreset RepairPreset(IDictionary corrupted) Id = Convert.ToInt32(GetNullableValue(corrupted, "Id")), Name = GetNullableValue(corrupted, "Name"), Gender = GetNullableValue(corrupted, "Gender"), - SampleRate = Convert.ToInt32(GetNullableValue(corrupted, "SampleRate")), - Pitch = Convert.ToSingle(GetNullableValue(corrupted, "Pitch")), PlaybackRate = Convert.ToSingle(GetNullableValue(corrupted, "PlaybackRate")), Volume = Convert.ToSingle(GetNullableValue(corrupted, "Volume")), Locale = GetNullableValue(corrupted, "Locale"), diff --git a/src/TextToTalk/packages.lock.json b/src/TextToTalk/packages.lock.json index 0ca8f093..b2523558 100644 --- a/src/TextToTalk/packages.lock.json +++ b/src/TextToTalk/packages.lock.json @@ -31,12 +31,12 @@ }, "Google.Cloud.TextToSpeech.V1": { "type": "Direct", - "requested": "[3.9.0, )", - "resolved": "3.9.0", - "contentHash": "JpejhPzzEQ6rdaf0nsjjJwj1CJb8Zs0x+TH27+A17KF2g0NqrgtAbpkUZTiGlQHhOzJSF1lB3amrQhbGjozJ3A==", + "requested": "[3.17.0, )", + "resolved": "3.17.0", + "contentHash": "27vM1NEBmCqAwqagwS0aEHfRBrFy7z6Ef+BblwKMaxtUUY0amdUdeXLY/PU8RSIHtJoan1K6ZKIS6YYqzgp77g==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.9.0, 5.0.0)", - "Google.LongRunning": "[3.3.0, 4.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)", + "Google.LongRunning": "[3.5.0, 4.0.0)" } }, "KokoroSharp.CPU": { @@ -77,6 +77,15 @@ "NAudio.WinMM": "2.2.1" } }, + "OpenAI": { + "type": "Direct", + "requested": "[2.8.0, )", + "resolved": "2.8.0", + "contentHash": "KcYpZ9IhuxFD2hGAJlL5vABtkr00CjeJU0SY8CjZQyzvzkzLop8jhdX3iDvteVJg6e3y4TEiY+Kti4gDJAagnA==", + "dependencies": { + "System.ClientModel": "1.8.1" + } + }, "R3": { "type": "Direct", "requested": "[1.2.9, )", @@ -135,103 +144,103 @@ }, "Google.Api.CommonProtos": { "type": "Transitive", - "resolved": "2.16.0", - "contentHash": "37MuZrE9AAqHAdYgFLoTHydAiXDRriQZGVKEg6fr6ASnrY5GtauYXnQrGk5x2K3NmYzEXe+wkpaPVmxjb3NKjg==", + "resolved": "2.17.0", + "contentHash": "elfQPknFr495hm7vdy6ZlgyQh6yzZq9TU7sS35L/Fj/fqjM/mUGau9gVJLhvQEtUlPjtR80hpn/m9HvBMyCXIw==", "dependencies": { - "Google.Protobuf": "[3.28.2, 4.0.0)" + "Google.Protobuf": "[3.31.1, 4.0.0]" } }, "Google.Api.Gax": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "fjHHYcQ99u0ztqwT537rvVtJMdDy6G2VHBZ+F1cBjDGYNVZfrpk40DMQ/OpUGToT9ZGHVirhh3eJ73bw2ANVPQ==", + "resolved": "4.12.1", + "contentHash": 
"G62dRNOv5DolfRviT6CCrL2a5nZ/CWWdRzhADkGnpCkYSOc3QnH5xxRvZiOKuHU8weJ/pAqAqrj7+T9IWdlu2Q==", "dependencies": { "Microsoft.Bcl.AsyncInterfaces": "6.0.0", - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Api.Gax.Grpc": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "ToCx/0cs+wJ9j7vzKRcPAKneJVZrz/s9JhW9QsFx1dar9WzTxawQZ8xTjyieSy8tY0UiYCL1qYkn/iRrklYnSA==", + "resolved": "4.12.1", + "contentHash": "W3LjuitOWxWyvbwqeHvpgp0LdshEiTnw/pneDAfAhQ02VgU2gVEzSXfGNPsvL8hDPBXjngR/fWNme8Kungwwkw==", "dependencies": { - "Google.Api.CommonProtos": "2.16.0", - "Google.Api.Gax": "4.9.0", - "Google.Apis.Auth": "1.68.0", - "Grpc.Auth": "2.66.0", - "Grpc.Core.Api": "2.66.0", - "Grpc.Net.Client": "2.66.0", + "Google.Api.CommonProtos": "2.17.0", + "Google.Api.Gax": "4.12.1", + "Google.Apis.Auth": "1.72.0", + "Grpc.Auth": "[2.71.0, 3.0.0)", + "Grpc.Core.Api": "[2.71.0, 3.0.0)", + "Grpc.Net.Client": "[2.71.0, 3.0.0)", "Microsoft.Extensions.DependencyInjection.Abstractions": "6.0.0" } }, "Google.Apis": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "s2MymhdpH+ybZNBeZ2J5uFgFHApBp+QXf9FjZSdM1lk/vx5VqIknJwnaWiuAzXxPrLEkesX0Q+UsiWn39yZ9zw==", + "resolved": "1.72.0", + "contentHash": "QbSJ08W7QuqsfzDPOZDHl1aFzCYwMcfBoHqQRh7koglwDN5WacShCKYMpU/zR1Pf3h3sH6JTGEeM/txAxaJuEg==", "dependencies": { - "Google.Apis.Core": "1.68.0" + "Google.Apis.Core": "1.72.0" } }, "Google.Apis.Auth": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "hFx8Qz5bZ4w0hpnn4tSmZaaFpjAMsgVElZ+ZgVLUZ2r9i+AKcoVgwiNfv1pruNS5cCvpXqhKECbruBCfRezPHA==", + "resolved": "1.72.0", + "contentHash": "RBoFwFKBHKUjuyJf2weEnqICQLaY0TdIrdFv2yC8bsiR2VFYxizOn3C/qN1FWCCb0Uh9GhW+zwAV1yUxPjiocw==", "dependencies": { - "Google.Apis": "1.68.0", - "Google.Apis.Core": "1.68.0", + "Google.Apis": "1.72.0", + "Google.Apis.Core": "1.72.0", "System.Management": "7.0.2" } }, "Google.Apis.Core": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "pAqwa6pfu53UXCR2b7A/PAPXeuVg6L1OFw38WckN27NU2+mf+KTjoEg2YGv/f0UyKxzz7DxF1urOTKg/6dTP9g==", + "resolved": "1.72.0", + "contentHash": "ZmYX1PU0vTKFT42c7gp4zaYcb/0TFAXrt9qw8yEz0wjvaug85+/WddlPTfT525Qei8iIUsF6t4bHYrsb2O7crg==", "dependencies": { - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.LongRunning": { "type": "Transitive", - "resolved": "3.3.0", - "contentHash": "F2SZ83Jo466Wj/s1Z7QhIAmWBXxJZQyXZpcx0P8BR7d6s0FAj67vQjeUPESSJcvsy8AqYiYBhkUr2YpZhTQeHg==", + "resolved": "3.5.0", + "contentHash": "W8xO6FA+rG8WjKOsyIjTKjeKLcyCrjBBYeEdZ4QBkKQcxmRczbrfKhKQmdorb2V35CqXeeTbue5Na6Zkgyv8ow==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.8.0, 5.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)" } }, "Google.Protobuf": { "type": "Transitive", - "resolved": "3.28.2", - "contentHash": "Z86ZKAB+v1B/m0LTM+EVamvZlYw/g3VND3/Gs4M/+aDIxa2JE9YPKjDxTpf0gv2sh26hrve3eI03brxBmzn92g==" + "resolved": "3.31.1", + "contentHash": "gSnJbUmGiOTdWddPhqzrEscHq9Ls6sqRDPB9WptckyjTUyx70JOOAaDLkFff8gManZNN3hllQ4aQInnQyq/Z/A==" }, "Grpc.Auth": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "FRQlhMAcHf0GjAXIfhN6RydfZncLLXNNTOtpLL1bt57kp59vu40faW+dr6Vwl7ef/IUFfF38aiB5jvhAA/9Aow==", + "resolved": "2.71.0", + "contentHash": "t2aGh/pMgqmc3GimtYfC7VcgVY/VSbk6SLH+61wewsgK45tzxxD9nYYItT5bpLn7fbebirmHXfgJcVKIArd0cg==", "dependencies": { - "Google.Apis.Auth": "1.68.0", - "Grpc.Core.Api": "2.66.0" + "Google.Apis.Auth": "1.69.0", + "Grpc.Core.Api": "2.71.0" } }, "Grpc.Core.Api": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": 
"HsjsQVAHe4hqP4t4rpUnmq+MZvPdyrlPsWF4T5fbMvyP3o/lMV+KVJfDlaNH8+v0aGQTVT3EsDFufbhaWb52cw==" + "resolved": "2.71.0", + "contentHash": "QquqUC37yxsDzd1QaDRsH2+uuznWPTS8CVE2Yzwl3CvU4geTNkolQXoVN812M2IwT6zpv3jsZRc9ExJFNFslTg==" }, "Grpc.Net.Client": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "GwkSsssXFgN9+M2U+UQWdErf61sn1iqgP+2NRBlDXATcP9vlxda0wySxd/eIL8U522+SnyFNUXlvQ5tAzGk9cA==", + "resolved": "2.71.0", + "contentHash": "U1vr20r5ngoT9nlb7wejF28EKN+taMhJsV9XtK9MkiepTZwnKxxiarriiMfCHuDAfPUm9XUjFMn/RIuJ4YY61w==", "dependencies": { - "Grpc.Net.Common": "2.66.0", + "Grpc.Net.Common": "2.71.0", "Microsoft.Extensions.Logging.Abstractions": "6.0.0" } }, "Grpc.Net.Common": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "YJpQpIvpo0HKlsG6SHwaieyji08qfv0DdEDIewCAA0egQY08637sHOj1netLGUhzBEsCqlGC3e92TZ2uqhxnvw==", + "resolved": "2.71.0", + "contentHash": "v0c8R97TwRYwNXlC8GyRXwYTCNufpDfUtj9la+wUrZFzVWkFJuNAltU+c0yI3zu0jl54k7en6u2WKgZgd57r2Q==", "dependencies": { - "Grpc.Core.Api": "2.66.0" + "Grpc.Core.Api": "2.71.0" } }, "KokoroSharp": { @@ -258,13 +267,16 @@ }, "Microsoft.Extensions.DependencyInjection.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "xlzi2IYREJH3/m6+lUrQlujzX8wDitm4QGnUu6kUXTQAWPuZY8i+ticFJbzfqaetLA6KR/rO6Ew/HuYD+bxifg==" + "resolved": "8.0.2", + "contentHash": "3iE7UF7MQkCv1cxzCahz+Y/guQbTqieyxyaWKhrRO91itI9cOKO76OHeQDahqG4MmW5umr3CcCvGmK92lWNlbg==" }, "Microsoft.Extensions.Logging.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "/HggWBbTwy8TgebGSX5DBZ24ndhzi93sHUBDvP1IxbZD7FDokYzdAr6+vbWGjw2XAfR2EJ1sfKUotpjHnFWPxA==" + "resolved": "8.0.3", + "contentHash": "dL0QGToTxggRLMYY4ZYX5AMwBb+byQBd/5dMiZE07Nv73o6I5Are3C7eQTh7K2+A4ct0PVISSr7TZANbiNb2yQ==", + "dependencies": { + "Microsoft.Extensions.DependencyInjection.Abstractions": "8.0.2" + } }, "Microsoft.ML.OnnxRuntime": { "type": "Transitive", @@ -331,8 +343,8 @@ }, "Newtonsoft.Json": { "type": "Transitive", - "resolved": "13.0.3", - "contentHash": "HrC5BXdl00IP9zeV+0Z848QWPAoCr9P3bDEZguI+gkLcBKAOxix/tLEAAHC+UvDNPv4a2d18lOReHMOagPa+zQ==" + "resolved": "13.0.4", + "contentHash": "pdgNNMai3zv51W5aq268sujXUyx7SNdE2bj1wZcWjAQrKMFZV260lbqYop1d2GM67JI1huLRwxo9ZqnfF/lC6A==" }, "NumSharp": { "type": "Transitive", @@ -358,6 +370,15 @@ "resolved": "5.0.0-pre.13", "contentHash": "65qbZS49AfrTM6jtZ2RDTWAzLe13ywCXIiSP5QrAJLmZT6sQqHGd1LfFXLhx8Ccp77qy7qh/LHsxpUOlkgZTCg==" }, + "System.ClientModel": { + "type": "Transitive", + "resolved": "1.8.1", + "contentHash": "4oUQgw/vaO4FBOk3YsH40hbrjxRED1l95rRLvTMtHXfQxapXya9IfPpm/KgwValFFtYTfYGFOs/qzGmGyexicQ==", + "dependencies": { + "Microsoft.Extensions.Logging.Abstractions": "8.0.3", + "System.Memory.Data": "8.0.1" + } + }, "System.CodeDom": { "type": "Transitive", "resolved": "7.0.0", @@ -371,6 +392,11 @@ "System.CodeDom": "7.0.0" } }, + "System.Memory.Data": { + "type": "Transitive", + "resolved": "8.0.1", + "contentHash": "BVYuec3jV23EMRDeR7Dr1/qhx7369dZzJ9IWy2xylvb4YfXsrUxspWc4UWYid/tj4zZK58uGZqn2WQiaDMhmAg==" + }, "System.Numerics.Tensors": { "type": "Transitive", "resolved": "9.0.5", From 62ef46c511e9c5ee2202ff446c9c0ce67e83a7d3 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Mon, 12 Jan 2026 21:39:56 -0500 Subject: [PATCH 02/13] Backend Changes for Azure, OpenAI Added Logging to measure time to first audio --- src/TextToTalk/Backends/Azure/AzureClient.cs | 84 ++++++++++++------- .../Backends/ElevenLabs/ElevenLabsClient.cs | 9 +- .../Backends/GoogleCloud/GoogleCloudClient.cs | 7 +- 
.../Backends/Kokoro/KokoroBackend.cs | 15 ++-- .../Backends/Kokoro/KokoroSoundQueue.cs | 22 +++-- .../Backends/OpenAI/OpenAiBackend.cs | 3 +- .../Backends/OpenAI/OpenAiClient.cs | 55 ++++++------ src/TextToTalk/Backends/Polly/PollyClient.cs | 8 +- .../Backends/StreamSoundQueueItem.cs | 3 + .../Backends/StreamingSoundQueue.cs | 35 +++++--- .../Backends/System/SystemBackend.cs | 5 +- .../Backends/System/SystemSoundQueue.cs | 10 ++- .../Backends/System/SystemSoundQueueItem.cs | 2 + .../Backends/Uberduck/UberduckClient.cs | 6 +- 14 files changed, 170 insertions(+), 94 deletions(-) diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index cbb933be..d0fd168a 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -3,30 +3,44 @@ using Serilog; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Threading; using System.Threading.Tasks; +using System.Net.Http; using TextToTalk.Lexicons; namespace TextToTalk.Backends.Azure; public class AzureClient : IDisposable { + private readonly HttpClient _httpClient; + private readonly string _apiKey; + private readonly string _endpoint; + private readonly SpeechConfig speechConfig; private readonly SpeechSynthesizer synthesizer; + private readonly StreamingSoundQueue soundQueue; - private readonly LexiconManager lexiconManager; + private readonly LexiconManager _lexiconManager; private readonly PluginConfiguration config; private CancellationTokenSource? _ttsCts; public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config) { - var audioConfig = AudioConfig.FromWavFileOutput("NUL"); - this.speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); - this.synthesizer = new SpeechSynthesizer(speechConfig, audioConfig); - this.soundQueue = new StreamingSoundQueue(config); - this.lexiconManager = lexiconManager; + _apiKey = subscriptionKey; + _endpoint = $"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"; + + _httpClient = new HttpClient(); + _httpClient.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", _apiKey); + _httpClient.DefaultRequestHeaders.Add("User-Agent", "TextToTalkApp"); + + soundQueue = new StreamingSoundQueue(config); + _lexiconManager = lexiconManager; + speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); + speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm); + synthesizer = new SpeechSynthesizer(speechConfig, null); } public TextSource GetCurrentlySpokenTextSource() @@ -64,38 +78,46 @@ public List GetVoices() public async Task Say(string? 
voice, int playbackRate, float volume, TextSource source, string text, string style) { + long methodStart = Stopwatch.GetTimestamp(); _ttsCts?.Cancel(); _ttsCts = new CancellationTokenSource(); var token = _ttsCts.Token; - var ssml = this.lexiconManager.MakeSsml( - text, - style, - voice: voice, - langCode: "en-US", - playbackRate: playbackRate, - includeSpeakAttributes: true); - - // LOW LATENCY PATH: Start speaking and stream chunks immediately - speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm); - using var result = await this.synthesizer.StartSpeakingSsmlAsync(ssml); - using var audioDataStream = AudioDataStream.FromResult(result); - - byte[] buffer = new byte[4096]; - uint bytesRead; - while ((bytesRead = audioDataStream.ReadData(buffer)) > 0) - { - if (token.IsCancellationRequested) break; - // Create a copy of the buffer for the specific chunk - var chunk = new byte[bytesRead]; - Buffer.BlockCopy(buffer, 0, chunk, 0, (int)bytesRead); + + var ssml = _lexiconManager.MakeSsml(text, style, voice, "en-US", playbackRate, true); + + using var request = new HttpRequestMessage(HttpMethod.Post, _endpoint) + { + Content = new StringContent(ssml, global::System.Text.Encoding.UTF8, "application/ssml+xml") + }; + + // 2026 Low Latency Format: 'raw' is better for direct streaming than 'riff' + request.Headers.Add("X-Microsoft-OutputFormat", "raw-16khz-16bit-mono-pcm"); + + using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); + response.EnsureSuccessStatusCode(); + + using var responseStream = await response.Content.ReadAsStreamAsync(token); + + byte[] buffer = new byte[4096]; + int bytesRead; + + while ((bytesRead = await responseStream.ReadAsync(buffer, 0, buffer.Length, token)) > 0) + { + if (token.IsCancellationRequested) break; + + var chunk = new byte[bytesRead]; + Buffer.BlockCopy(buffer, 0, chunk, 0, bytesRead); + + var chunkStream = new MemoryStream(chunk); + long? timestampToPass = methodStart; + soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Azure, null, timestampToPass); - var chunkStream = new MemoryStream(chunk); - this.soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Azure, null); - } - } + // Implicitly returns Task.CompletedTask because it is 'async Task' + } + public Task CancelAllSounds() { diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs index e65b25a3..aae4eae1 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.IO; using System.Linq; using System.Net; @@ -34,7 +35,7 @@ public ElevenLabsClient(StreamingSoundQueue soundQueue, HttpClient http) public async Task Say(string? voice, int playbackRate, float volume, float similarityBoost, float stability, TextSource source, string text, string? model, string? style) { - Log.Information($"Style = {style}"); + long methodStart = Stopwatch.GetTimestamp(); _TtsCts?.Cancel(); _TtsCts?.Dispose(); @@ -70,8 +71,6 @@ public async Task Say(string? 
voice, int playbackRate, float volume, float simil Stability = finalStability, }, }; - Log.Information($"Model Called = {args.ModelId}"); - Log.Information($"Message Sent = {args.Text}"); var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; @@ -89,10 +88,10 @@ public async Task Say(string? voice, int playbackRate, float volume, float simil // Get the stream directly from the response var responseStream = await res.Content.ReadAsStreamAsync(ct); - + long? timestampToPass = methodStart; // Enqueue the live stream. // IMPORTANT: Your soundQueue must be able to process the stream as bytes arrive. - this.soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Mp3, res); + this.soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Mp3, res, timestampToPass); } catch (OperationCanceledException) { diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs index d683bc68..4d44b695 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs @@ -1,6 +1,7 @@ using Google.Cloud.TextToSpeech.V1; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Threading; @@ -77,6 +78,7 @@ public List ExtractUniqueLocales(List? voicesList) public async Task Say(string? locale, string? voice, float? speed, float volume, TextSource source, string text) { + long methodStart = Stopwatch.GetTimestamp(); if (client == null || soundQueue == null || locale == null) return; if (_TtsCts != null) @@ -125,7 +127,7 @@ await streamingCall.WriteAsync(new StreamingSynthesizeRequest await streamingCall.WriteCompleteAsync(); // 4. Process the response stream with the cancellation token - // Use WithCancellation to properly dispose of the enumerator on cancel + await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) { if (response.AudioContent.Length > 0) @@ -134,7 +136,8 @@ await streamingCall.WriteAsync(new StreamingSynthesizeRequest // Note: Linear16 audio is typically handled as StreamFormat.Pcm // but matches Wave if your queue expects raw headerless bytes. - soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave, null); + long? timestampToPass = methodStart; + soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave, null, timestampToPass); } } } diff --git a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs index 6c29df93..ab1796e8 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs @@ -1,15 +1,16 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Game; +using KokoroSharp; +using KokoroSharp.Core; +using KokoroSharp.Processing; using System; +using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.IO; using System.Net.Http; using System.Security.Cryptography; using System.Threading; using System.Threading.Tasks; -using Dalamud.Game; -using Dalamud.Bindings.ImGui; -using KokoroSharp; -using KokoroSharp.Core; -using KokoroSharp.Processing; namespace TextToTalk.Backends.Kokoro; @@ -105,6 +106,8 @@ public override void Say(SayRequest request) public void Say(string text, KokoroVoicePreset voicePreset, TextSource source, ClientLanguage language) { + long methodStart = Stopwatch.GetTimestamp(); + long? 
timestampToPass = methodStart; if (!TryGetModel(out _)) { return; @@ -120,7 +123,7 @@ public void Say(string text, KokoroVoicePreset voicePreset, TextSource source, C } // TODO: apply lexicon once KokoroSharp supports it - soundQueue.EnqueueSound(new(text, voice, voicePreset.Speed ?? 1f, voicePreset.Volume ?? 0.6f, source, language)); + soundQueue.EnqueueSound(new(text, voice, voicePreset.Speed ?? 1f, voicePreset.Volume ?? 0.6f, source, language, timestampToPass)); } public override void CancelAllSpeech() diff --git a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs index be68f757..f150d2d8 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs @@ -1,14 +1,16 @@ -using NAudio.CoreAudioApi; +using Dalamud.Game; +using KokoroSharp; +using KokoroSharp.Core; +using KokoroSharp.Processing; +using NAudio.CoreAudioApi; using NAudio.Wave; +using Serilog; using System; +using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.IO; using System.Threading; using System.Threading.Tasks; -using Dalamud.Game; -using KokoroSharp; -using KokoroSharp.Core; -using KokoroSharp.Processing; namespace TextToTalk.Backends.Kokoro; @@ -92,6 +94,11 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) this.bufferedProvider.AddSamples(bytes, 0, bytes.Length); if (this.soundOut.PlaybackState != PlaybackState.Playing) { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + } this.soundOut.Play(); } } @@ -167,7 +174,7 @@ public void EnqueueSound(KokoroSourceQueueItem item) public class KokoroSourceQueueItem : SoundQueueItem { - public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float volume, TextSource source, ClientLanguage language) + public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float volume, TextSource source, ClientLanguage language, long? startTime) { Source = source; Text = text; @@ -176,6 +183,7 @@ public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float Volume = volume; Source = source; Language = language; + StartTime = startTime; } public string Text { get; } @@ -185,6 +193,8 @@ public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float public bool Aborted { get; private set; } public ClientLanguage Language { get; } + public long? StartTime { get; set; } // Use GetTimestamp() value + internal void Cancel() { Aborted = true; diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs index 1ea5a366..84ea7c62 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs @@ -37,8 +37,7 @@ public override void Say(SayRequest request) { try { - Log.Information($"Voice Style = {voicePreset.Style}"); - await this.uiModel.OpenAi.Say(request.Text, voicePreset.Model, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty), 1.0f, voicePreset.Volume); + await this.uiModel.OpenAi.Say(request.Text, voicePreset.Model, request.Source, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? 
string.Empty), 1.0f, voicePreset.Volume); } catch (OpenAiUnauthorizedException e) { diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs index cfbb357e..c11b763a 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs @@ -1,13 +1,17 @@ -using NAudio.CoreAudioApi; +using Dalamud.Bindings.ImGui; +using NAudio.CoreAudioApi; using OpenAI; using Serilog; using System; using System.ClientModel; using System.ClientModel.Primitives; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Net.Http; +using System.Net.Http.Headers; +using System.Text; using System.Text.Json; using System.Threading; using System.Threading.Tasks; @@ -21,6 +25,8 @@ public class OpenAiClient private readonly StreamingSoundQueue _soundQueue; public CancellationTokenSource? _ttsCts; + private readonly HttpClient _httpClient = new(); + // --- Provided Definitions --- public record ModelConfig( string ModelName, @@ -66,58 +72,57 @@ public OpenAiClient(StreamingSoundQueue soundQueue, string apiKey) } } - public async Task Say(string text, string modelName, string voiceId, string? instructions, float speed, float volume) + public async Task Say(string text, string modelName, TextSource source, string voiceId, string? instructions, float speed, float volume) { - if (_openAiClient == null) return; + long methodStart = Stopwatch.GetTimestamp(); + if (string.IsNullOrWhiteSpace(ApiKey)) return; - // Cancel any previous request before starting a new one _ttsCts?.Cancel(); _ttsCts = new CancellationTokenSource(); var token = _ttsCts.Token; try { - OpenAIAudio.AudioClient audioClient = _openAiClient.GetAudioClient(modelName); - + // 1. Prepare the JSON Payload var requestBody = new Dictionary { { "model", modelName }, { "input", text }, { "voice", voiceId.ToLowerInvariant() }, - { "response_format", "mp3" }, + { "response_format", "pcm" }, { "speed", speed } }; - if (Models.First(m => m.ModelName == "gpt-4o-mini-tts").InstructionsSupported) + // Check if model supports instructions (gpt-4o-mini-tts) + var modelCfg = Models.FirstOrDefault(m => m.ModelName == modelName); + if (modelCfg != null && modelCfg.InstructionsSupported && !string.IsNullOrEmpty(instructions)) { - requestBody["instructions"] = instructions ?? ""; + requestBody["instructions"] = instructions; } - BinaryContent content = BinaryContent.Create(BinaryData.FromObjectAsJson(requestBody)); - RequestOptions options = new(); - options.BufferResponse = false; - - // PASS THE TOKEN HERE - options.CancellationToken = token; - - // The request will throw OperationCanceledException if cancelled during the call - ClientResult result = await audioClient.GenerateSpeechAsync(content, options); + // 2. Configure the Request + using var request = new HttpRequestMessage(HttpMethod.Post, "https://api.openai.com/v1/audio/speech"); + request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", ApiKey); + request.Content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json"); - Stream liveAudioStream = result.GetRawResponse().ContentStream; + // 3. 
Send and Stream Response + // HttpCompletionOption.ResponseHeadersRead is the "magic" for low latency + var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); + response.EnsureSuccessStatusCode(); - // Register a callback to close the stream if cancellation happens while reading - token.Register(() => liveAudioStream.Close()); + var responseStream = await response.Content.ReadAsStreamAsync(token); - Log.Information("Queuing Sound"); - _soundQueue.EnqueueSound(liveAudioStream, TextSource.None, volume, StreamFormat.Mp3, null); + // 4. Pass the live stream directly to the sound queue + // The queue will handle the background reading/decoding + _soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Wave, null, methodStart); } catch (OperationCanceledException) { - Log.Information("OpenAI Speech generation was cancelled by the user."); + Log.Information("OpenAI Speech generation was cancelled."); } catch (Exception ex) { - Log.Error(ex, "OpenAI Streaming Speech generation failed."); + Log.Error(ex, "OpenAI REST Speech generation failed."); } } } diff --git a/src/TextToTalk/Backends/Polly/PollyClient.cs b/src/TextToTalk/Backends/Polly/PollyClient.cs index 1202e8f5..eb607107 100644 --- a/src/TextToTalk/Backends/Polly/PollyClient.cs +++ b/src/TextToTalk/Backends/Polly/PollyClient.cs @@ -5,6 +5,7 @@ using Serilog; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Threading; using System.Threading.Tasks; @@ -57,6 +58,7 @@ public TextSource GetCurrentlySpokenTextSource() public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, int sampleRate, int playbackRate, float volume, TextSource source, string text) { + long methodStart = Stopwatch.GetTimestamp(); _TtsCts?.Cancel(); _TtsCts?.Dispose(); @@ -80,7 +82,7 @@ public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, in SampleRate = sampleRate.ToString(), TextType = TextType.Ssml, }; - + bool isFirstChunk = true; try { // Using 'using' ensures the response (and its stream) is disposed after the queue handles it @@ -88,7 +90,9 @@ public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, in // Pass the live AudioStream directly to the queue. // Ensure EnqueueSound is updated to process the stream as it arrives. - this.soundQueue.EnqueueSound(res.AudioStream, source, volume, StreamFormat.Mp3, null); + long? timestampToPass = isFirstChunk ? methodStart : null; + this.soundQueue.EnqueueSound(res.AudioStream, source, volume, StreamFormat.Mp3, null, timestampToPass); + isFirstChunk = false; } catch (OperationCanceledException) { diff --git a/src/TextToTalk/Backends/StreamSoundQueueItem.cs b/src/TextToTalk/Backends/StreamSoundQueueItem.cs index a9b2c96d..089c9494 100644 --- a/src/TextToTalk/Backends/StreamSoundQueueItem.cs +++ b/src/TextToTalk/Backends/StreamSoundQueueItem.cs @@ -1,5 +1,6 @@ using System.IO; using System.Net.Http; +using System.Diagnostics; using static TextToTalk.Backends.System.SystemSoundQueue; namespace TextToTalk.Backends @@ -37,6 +38,8 @@ public class StreamingSoundQueueItem : SoundQueueItem public HttpResponseMessage? Response { get; set; } public bool Aborted { get; set; } + public long? 
StartTime { get; set; } // Use GetTimestamp() value + protected override void Dispose(bool disposing) { if (disposing) diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs index 31996af1..2c1a2538 100644 --- a/src/TextToTalk/Backends/StreamingSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -1,9 +1,11 @@ -using NAudio.CoreAudioApi; +using Google.Protobuf.WellKnownTypes; +using NAudio.CoreAudioApi; using NAudio.Wave; using NAudio.Wave.SampleProviders; using Serilog; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Net.Http; using System.Net.Sockets; @@ -34,7 +36,7 @@ public class StreamingSoundQueue(PluginConfiguration config) : SoundQueue 4096 && this.soundOut.PlaybackState != PlaybackState.Playing) { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + } this.soundOut.Play(); } } @@ -137,12 +144,11 @@ private void ProcessMp3Stream(StreamingSoundQueueItem nextItem) private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) { - Log.Information("Playing as raw PCM"); - // Resolve format for raw PCM types + // Resolve format WaveFormat chunkFormat = nextItem.Format switch { StreamFormat.Wave => Wave, - StreamFormat.Azure => Azure, + StreamFormat.Azure => Azure, // Ensure this is 24000Hz for OpenAI PCM! _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."), }; @@ -150,18 +156,27 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) { EnsureHardwareInitialized(chunkFormat); - // Read in chunks to avoid calling .Length on network streams byte[] chunkBuffer = new byte[16384]; int bytesRead; + bool latencyLogged = false; // Flag to ensure we only log once + while ((bytesRead = nextItem.Data.Read(chunkBuffer, 0, chunkBuffer.Length)) > 0) { ApplyVolumeToPcmBuffer(chunkBuffer, bytesRead, nextItem.Volume); - this.bufferedProvider.AddSamples(chunkBuffer, 0, bytesRead); - if (this.bufferedProvider.BufferedBytes > 512 && this.soundOut.PlaybackState != PlaybackState.Playing) + // Condition: Buffer has enough "cushion" to prevent the blip + if (this.bufferedProvider.BufferedBytes > 16384 && this.soundOut.PlaybackState != PlaybackState.Playing) { this.soundOut.Play(); + + // Log latency exactly once when the sound actually starts + if (!latencyLogged && nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + latencyLogged = true; + } } } } diff --git a/src/TextToTalk/Backends/System/SystemBackend.cs b/src/TextToTalk/Backends/System/SystemBackend.cs index d37accce..00887ecb 100644 --- a/src/TextToTalk/Backends/System/SystemBackend.cs +++ b/src/TextToTalk/Backends/System/SystemBackend.cs @@ -1,4 +1,5 @@ using System; +using System.Diagnostics; using System.Net.Http; using System.Threading; @@ -30,7 +31,9 @@ public override void DrawStyles(IConfigUIDelegates helpers) } public override void Say(SayRequest request) { - this.soundQueue.EnqueueSound(request.Voice, request.Source, request.Text); + long methodStart = Stopwatch.GetTimestamp(); + long? 
timestampToPass = methodStart; + this.soundQueue.EnqueueSound(request.Voice, request.Source, request.Text, timestampToPass); } public override void CancelAllSpeech() diff --git a/src/TextToTalk/Backends/System/SystemSoundQueue.cs b/src/TextToTalk/Backends/System/SystemSoundQueue.cs index 3650b741..a72921a1 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueue.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueue.cs @@ -5,6 +5,7 @@ using Serilog; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Speech.Synthesis; using System.Threading; @@ -45,13 +46,14 @@ private SpeechSynthesizer GetSynthesizerForVoice(string voiceName) } return synth; } - public void EnqueueSound(VoicePreset preset, TextSource source, string text) + public void EnqueueSound(VoicePreset preset, TextSource source, string text, long? timeStamp) { AddQueueItem(new SystemSoundQueueItem { Preset = preset, Source = source, Text = text, + StartTime = timeStamp, }); } @@ -137,8 +139,12 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) { if (this.soundOut?.PlaybackState != PlaybackState.Playing) { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + } this.soundOut?.Play(); - Log.Information("Playing"); } } } diff --git a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs index a95c8b19..34bf0eda 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs @@ -8,6 +8,8 @@ public class SystemSoundQueueItem : SoundQueueItem public bool Aborted { get; private set; } + public long? StartTime { get; set; } // Use GetTimestamp() value + internal void Cancel() { Aborted = true; diff --git a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs index 33f52f61..9d387850 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.IO; using System.Linq; using System.Net.Http; @@ -45,6 +46,7 @@ public UberduckClient(StreamingSoundQueue soundQueue, HttpClient http) public async Task Say(string voice, int playbackRate, float volume, TextSource source, string text) { + long methodStart = Stopwatch.GetTimestamp(); var url = "https://api.uberduck.ai/v1/text-to-speech"; var payload = new @@ -72,10 +74,10 @@ public async Task Say(string voice, int playbackRate, float volume, TextSource s // Use a MemoryStream to hold the downloaded data var waveStream = new MemoryStream(audioBytes); - + long? timestampToPass = methodStart; // Pass the stream to your queue. Ensure the consumer uses WaveFileReader // to correctly handle the WAV container. 
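// A note on the latency bookkeeping used by every backend in this patch: Say() captures
// Stopwatch.GetTimestamp() on entry and the sound queue logs the delta just before playback
// starts. A minimal sketch of the pattern (names here are illustrative, not the plugin's API):
//
//     long start = Stopwatch.GetTimestamp();                 // at Say() entry
//     queue.EnqueueSound(stream, source, volume, format, null, start);
//     // ...later, inside the queue, right before soundOut.Play():
//     var elapsed = Stopwatch.GetElapsedTime(start);         // available since .NET 7
//     Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds);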
- this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Uberduck, null); + this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Uberduck, null, timestampToPass); } } } From 31b39d294608f7044fcea93eba5a56d8530c64fb Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Tue, 13 Jan 2026 22:49:03 -0500 Subject: [PATCH 03/13] Added Piper Backend --- src/TextToTalk.Tests/packages.lock.json | 25 ++ .../Backends/Kokoro/KokoroBackend.cs | 4 - src/TextToTalk/Backends/Piper/PiperBackend.cs | 214 ++++++++++++++++++ .../Backends/Piper/PiperBackendUI.cs | 132 +++++++++++ .../Backends/Piper/PiperSoundQueueItem.cs | 29 +++ .../Backends/Piper/PiperVoicePreset.cs | 26 +++ src/TextToTalk/Backends/StreamFormat.cs | 1 + .../Backends/StreamingSoundQueue.cs | 43 +++- src/TextToTalk/Backends/TTSBackend.cs | 5 + .../Backends/VoiceBackendManager.cs | 2 + .../GameEnums/AdditionalChatType.cs | 2 +- src/TextToTalk/TextToTalk.cs | 2 + src/TextToTalk/TextToTalk.csproj | 1 + src/TextToTalk/VoicePresetConfiguration.cs | 16 +- src/TextToTalk/packages.lock.json | 25 ++ 15 files changed, 510 insertions(+), 17 deletions(-) create mode 100644 src/TextToTalk/Backends/Piper/PiperBackend.cs create mode 100644 src/TextToTalk/Backends/Piper/PiperBackendUI.cs create mode 100644 src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs create mode 100644 src/TextToTalk/Backends/Piper/PiperVoicePreset.cs diff --git a/src/TextToTalk.Tests/packages.lock.json b/src/TextToTalk.Tests/packages.lock.json index c0a0507f..4c576c91 100644 --- a/src/TextToTalk.Tests/packages.lock.json +++ b/src/TextToTalk.Tests/packages.lock.json @@ -372,11 +372,30 @@ "resolved": "5.0.0-pre.13", "contentHash": "65qbZS49AfrTM6jtZ2RDTWAzLe13ywCXIiSP5QrAJLmZT6sQqHGd1LfFXLhx8Ccp77qy7qh/LHsxpUOlkgZTCg==" }, + "PiperSharp": { + "type": "Transitive", + "resolved": "1.0.6", + "contentHash": "g68TbampKc0ATx80nur6LHHrhIpXvmioIVuwAuWKcjTXTB2tf+Klk4JPwzWZRo+DRSR4kS370eh+davEQVR0cw==", + "dependencies": { + "NAudio": "2.2.1", + "NAudio.Core": "2.2.1", + "Newtonsoft.Json": "13.0.1", + "SharpCompress": "0.36.0" + } + }, "R3": { "type": "Transitive", "resolved": "1.2.9", "contentHash": "dKMFt90XW+n7JK2P40dx9uuLg57Pcj4cA/9n1NwdKWFcMAM6j49OU8h9EborpVe4KXI+2MV/EjKc1LG7fhQJUA==" }, + "SharpCompress": { + "type": "Transitive", + "resolved": "0.36.0", + "contentHash": "48am//T6Ou+GmyPmBaxaFN1ym0VNidRcBeANr9+OYTzpKRz8QMGzAkHVkCV30lFQ/gnWqGr50AuebahpG1C6xA==", + "dependencies": { + "ZstdSharp.Port": "0.7.4" + } + }, "Standart.Hash.xxHash": { "type": "Transitive", "resolved": "4.0.5", @@ -477,6 +496,11 @@ "resolved": "15.3.0", "contentHash": "F93japYa9YrJ59AZGhgdaUGHN7ITJ55FBBg/D/8C0BDgahv/rQD6MOSwHxOJJpon1kYyslVbeBrQ2wcJhox01w==" }, + "ZstdSharp.Port": { + "type": "Transitive", + "resolved": "0.7.4", + "contentHash": "ziptnotpUJr51afwXJQ5Wc03dvDiZAdmxS08s1g7SHn/VzbyZUXdH6yORk/zaNjzUOEE6pVZ0Nqztab0rYROgQ==" + }, "texttotalk": { "type": "Project", "dependencies": { @@ -488,6 +512,7 @@ "Microsoft.CognitiveServices.Speech": "[1.41.1, )", "NAudio": "[2.2.1, )", "OpenAI": "[2.8.0, )", + "PiperSharp": "[1.0.6, )", "R3": "[1.2.9, )", "Standart.Hash.xxHash": "[4.0.5, )", "System.Drawing.Common": "[9.0.0, )", diff --git a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs index ab1796e8..c146e18b 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs @@ -165,10 +165,6 @@ protected override void Dispose(bool disposing) cts.Cancel(); if (disposing) { - if 
(TryGetModel(out var model)) - { - model.Dispose(); - } soundQueue.Dispose(); } } diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs new file mode 100644 index 00000000..ad7e1250 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -0,0 +1,214 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Game; +using PiperSharp; +using PiperSharp.Models; +using Serilog; +using System; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; +using TextToTalk.Backends.Kokoro; +using TextToTalk.Backends.Piper; + +namespace TextToTalk.Backends.Piper; + +public class PiperBackend : VoiceBackend +{ + private readonly PiperProvider piper; + private readonly PiperBackendUI ui; + private readonly StreamingSoundQueue soundQueue; + private readonly Task modelTask; + private readonly CancellationTokenSource cts = new(); + + private Process? piperServerProcess; + + private string GetVoicesDir(PluginConfiguration config) => + Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + + public PiperBackend(PluginConfiguration config) + { + ui = new PiperBackendUI(config, this); + string piperExe = Path.Join(config.GetPluginConfigDirectory(), "piper", "piper.exe"); + + piper = new PiperProvider(new PiperConfiguration() + { + ExecutableLocation = piperExe, + WorkingDirectory = Path.GetDirectoryName(piperExe) + }); + + modelTask = LoadOrDownloadModelAsync(config); + soundQueue = new StreamingSoundQueue(config); + } + public static bool IsModelFileDownloaded(PluginConfiguration config) + { + var piperExePath = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper.exe"); + return File.Exists(piperExePath); + } + + /// Downloads the Piper executable and initial voice models. 
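    /// <remarks>
    /// Rough sketch of the on-disk layout this method (together with LoadSpecificVoiceModel below)
    /// is expected to produce, inferred from the paths used in this file rather than documented anywhere:
    ///   {PluginConfigDirectory}/piper/piper.exe
    ///   {PluginConfigDirectory}/piper/voices/{modelKey}/{modelKey}.onnx
    ///   {PluginConfigDirectory}/piper/voices/{modelKey}/{modelKey}.onnx.json  (copied to model.json for PiperSharp)
    /// </remarks>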
+ public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) + { + string configDir = config.GetPluginConfigDirectory(); + string voicesDir = GetVoicesDir(config); + var allModels = await PiperDownloader.GetHuggingFaceModelList(); + + var filteredModels = allModels + .Where(m => m.Key.StartsWith("en") && m.Key.EndsWith("medium")) + .ToList(); + + foreach (var modelEntry in filteredModels) + { + string modelKey = modelEntry.Key; + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + if (File.Exists(Path.Combine(modelTargetDir, $"{modelKey}.onnx"))) continue; + + try + { + DetailedLog.Info($"Downloading English medium voice: {modelKey}"); + await modelEntry.Value.DownloadModel(voicesDir); + + string onnxPath = Path.Combine(modelTargetDir, $"{modelKey}.onnx"); + await LoadSpecificVoiceModel(onnxPath); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to download {modelKey}: {ex.Message}"); + } + } + } + + private async Task LoadOrDownloadModelAsync(PluginConfiguration config) + { + + string modelKey = "en_US-lessac-medium"; + string modelDir = Path.Combine(GetVoicesDir(config), modelKey); + string onnxFilePath = Path.Combine(modelDir, $"{modelKey}.onnx"); + + if (!File.Exists(onnxFilePath)) + { + await EnsurePiperAssetsDownloaded(config); + } + + return await LoadSpecificVoiceModel(onnxFilePath); + } + + private async Task LoadSpecificVoiceModel(string onnxFilePath) + { + string modelDir = Path.GetDirectoryName(onnxFilePath); + string configFilePath = onnxFilePath + ".json"; + string piperSharpExpectedJson = Path.Combine(modelDir, "model.json"); + + if (File.Exists(configFilePath) && !File.Exists(piperSharpExpectedJson)) + { + File.Copy(configFilePath, piperSharpExpectedJson, true); + } + + return await VoiceModel.LoadModel(modelDir); + } + + private bool TryGetModel([NotNullWhen(true)] out VoiceModel? tts) + { + if (modelTask.IsCompletedSuccessfully) + { + tts = modelTask.Result; + return true; + } + + tts = null; + return false; + } + + public override void Say(SayRequest request) + { + if (request.Voice is not PiperVoicePreset voicePreset) + throw new InvalidOperationException("Invalid voice preset."); + + if (!modelTask.IsCompletedSuccessfully) return; + + Say(request.Text, voicePreset, request.Source); + } + + public async Task Say(string text, PiperVoicePreset voicePreset, TextSource source) + { + long methodStart = Stopwatch.GetTimestamp(); + long? timestampToPass = methodStart; + + if (string.IsNullOrEmpty(voicePreset.ModelPath) || !File.Exists(voicePreset.ModelPath)) + { + DetailedLog.Error($"Piper model file not found: {voicePreset.ModelPath}"); + return; + } + + try + { + var voiceDir = Path.GetDirectoryName(voicePreset.ModelPath); + var voiceModel = await VoiceModel.LoadModel(voiceDir); + + piper.Configuration.Model = voiceModel; + piper.Configuration.SpeakingRate = 1.0f / voicePreset.Speed ?? 1f; + + byte[] audioData = await piper.InferAsync(text, AudioOutputType.Raw, cts.Token); + if (audioData == null || audioData.Length == 0) return; + var audioStream = new MemoryStream(audioData); + soundQueue.EnqueueSound(audioStream, source, voicePreset.Volume ?? 
1f, StreamFormat.Piper, null, timestampToPass); + } + catch (Exception ex) + { + DetailedLog.Error($"Piper switching/inference failed: {ex.Message}"); + } + } + + public override void CancelAllSpeech() + { + soundQueue.CancelAllSounds(); + } + + public override void CancelSay(TextSource source) + { + soundQueue.CancelFromSource(source); + } + + public override void DrawSettings(IConfigUIDelegates helpers) + { + if (TryGetModel(out _)) + { + ui.DrawVoicePresetOptions(); + return; + } + + if (modelTask.Status == TaskStatus.Faulted) + { + ImGui.TextColored(ImColor.Red, $"Failed to download model: {modelTask.Exception?.Message}"); + DetailedLog.Error($"Failed to download Piper model: {modelTask.Exception}"); + } + else + { + ImGui.TextColored(ImColor.HintColor, "Model is still downloading or initializing..."); + } + } + + public override TextSource GetCurrentlySpokenTextSource() + { + return soundQueue.GetCurrentlySpokenTextSource(); + } + public override void DrawStyles(IConfigUIDelegates helpers) + { + helpers.OpenVoiceStylesConfig(); + } + protected override void Dispose(bool disposing) + { + cts.Cancel(); + + if (disposing) + { + soundQueue?.Dispose(); + cts.Dispose(); + } + } + +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs new file mode 100644 index 00000000..cfb0cdb3 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -0,0 +1,132 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Utility; +using System; +using System.IO; +using System.Linq; +using TextToTalk.UI; + +namespace TextToTalk.Backends.Piper; + +public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBackend) +{ + private string[] availableModelPaths; + private string[] modelDisplayNames; + private int selectedModelIndex = -1; + + public void DrawVoicePresetOptions() + { + ImGui.TextColored(ImColor.HintColor, "Piper is a local neural TTS engine. Ensure you have downloaded models."); + + ImGui.Spacing(); + + var currentVoicePreset = config.GetCurrentVoicePreset(); + var presets = config.GetVoicePresetsForBackend(TTSBackend.Piper).ToList(); + + if (presets.Count > 0 && currentVoicePreset != null) + { + var presetIndex = currentVoicePreset is not null ? presets.IndexOf(currentVoicePreset) : -1; + if (ImGui.Combo($"Voice preset##{MemoizedId.Create()}", ref presetIndex, + presets.Select(p => p.Name).ToArray(), presets.Count)) + { + config.SetCurrentVoicePreset(presets[presetIndex].Id); + config.Save(); + selectedModelIndex = -1; + } + } + else if (currentVoicePreset != null) + { + ImGui.TextColored(ImColor.Red, "You have no presets. 
Create one to begin."); + } + else if (currentVoicePreset == null && presets.Count > 0) + { + config.SetCurrentVoicePreset(presets.First().Id); + } + + BackendUI.NewPresetButton($"New preset##{MemoizedId.Create()}", config); + + if (presets.Count == 0 || currentVoicePreset is null) return; + + ImGui.SameLine(); + BackendUI.DeletePresetButton($"Delete preset##{MemoizedId.Create()}", currentVoicePreset, TTSBackend.Piper, config); + + ImGui.Separator(); + + var presetName = currentVoicePreset.Name; + if (ImGui.InputText($"Preset name##{MemoizedId.Create()}", ref presetName, 64)) + { + currentVoicePreset.Name = presetName; + config.Save(); + } + + // --- Model Selection --- + var piperDir = Path.Combine(config.GetPluginConfigDirectory(), "piper"); + var voicesDir = Path.Combine(piperDir, "voices"); + + if (!Directory.Exists(voicesDir)) + Directory.CreateDirectory(voicesDir); + + availableModelPaths = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); + modelDisplayNames = availableModelPaths.Select(Path.GetFileName).ToArray(); + + if (availableModelPaths.Length > 0) + { + if (selectedModelIndex == -1 || + selectedModelIndex >= availableModelPaths.Length || + availableModelPaths[selectedModelIndex] != currentVoicePreset.ModelPath) + { + selectedModelIndex = Array.IndexOf(availableModelPaths, currentVoicePreset.ModelPath ?? ""); + if (selectedModelIndex == -1) selectedModelIndex = 0; + } + + if (ImGui.Combo($"Voice Model (.onnx)##{MemoizedId.Create()}", ref selectedModelIndex, modelDisplayNames, modelDisplayNames.Length)) + { + currentVoicePreset.ModelPath = availableModelPaths[selectedModelIndex]; + currentVoicePreset.InternalName = modelDisplayNames[selectedModelIndex].Replace(".onnx", ""); + config.Save(); + } + } + else + { + ImGui.TextColored(ImColor.Red, $"No .onnx models found in subdirectories of: {voicesDir}"); + } + + if (ImGui.Button($"Download Models##{MemoizedId.Create()}")) + { + _ = piperBackend.EnsurePiperAssetsDownloaded(config); + } + Components.Tooltip("Will download all English voices at medium quality (appx 1.5GB)"); + + // --- Voice Parameters --- + var speed = currentVoicePreset.Speed ?? 1f; + if (ImGui.SliderFloat($"Speed##{MemoizedId.Create()}", ref speed, 0.5f, 3.0f, "%.2fx")) + { + currentVoicePreset.Speed = speed; + config.Save(); + } + + var volume = (int)((currentVoicePreset.Volume ?? 1.0f) * 100); + if (ImGui.SliderInt($"Volume##{MemoizedId.Create()}", ref volume, 0, 200, "%d%%")) + { + currentVoicePreset.Volume = MathF.Round((float)volume / 100, 2); + config.Save(); + } + + if (ImGui.Button($"Test##{MemoizedId.Create()}")) + { + if (!string.IsNullOrEmpty(currentVoicePreset.ModelPath) && File.Exists(currentVoicePreset.ModelPath)) + { + piperBackend.CancelSay(TextSource.Chat); + piperBackend.Say($"Hello from Piper neural engine. 
This is a test message", currentVoicePreset, + TextSource.Chat); + } + } + + ImGui.Separator(); + + ConfigComponents.ToggleUseGenderedVoicePresets($"Use gendered voices##{MemoizedId.Create()}", config); + if (config.UseGenderedVoicePresets) + { + BackendUI.GenderedPresetConfig("Piper", TTSBackend.Piper, config, presets); + } + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs b/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs new file mode 100644 index 00000000..02a9ead6 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs @@ -0,0 +1,29 @@ +using Dalamud.Game; +using TextToTalk.Backends.Piper; + +namespace TextToTalk.Backends.Piper; + +public class PiperSoundQueueItem : SoundQueueItem +{ + public string Text { get; } + public PiperVoicePreset Voice { get; } + + public float Speed { get; } + public float Volume { get; } + public bool Aborted { get; private set; } + public ClientLanguage Language { get; } + + public long? StartTime { get; set; } + + public PiperSoundQueueItem(string text, PiperVoicePreset voice, TextSource source, ClientLanguage language, long? startTime) + { + Text = text; + Voice = voice; + Source = source; + Language = language; + StartTime = startTime; + + Speed = voice.Speed ?? 1.0f; + Volume = voice.Volume ?? 1.0f; + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs b/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs new file mode 100644 index 00000000..dccacb8e --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs @@ -0,0 +1,26 @@ +using Newtonsoft.Json; + +namespace TextToTalk.Backends.Piper; + +public class PiperVoicePreset : VoicePreset +{ + [JsonProperty("ModelName")] + public string? InternalName { get; set; } + + [JsonProperty("ModelPath")] + public string? ModelPath { get; set; } + + public float? Speed { get; set; } + + public float? Volume { get; set; } + + public override bool TrySetDefaultValues() + { + InternalName = "en_US-lessac-medium"; + ModelPath = ""; // To be populated by the file picker or downloader + Speed = 1.0f; + Volume = 1.0f; + EnabledBackend = TTSBackend.Piper; + return true; + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamFormat.cs b/src/TextToTalk/Backends/StreamFormat.cs index f6ff90a3..3404baf6 100644 --- a/src/TextToTalk/Backends/StreamFormat.cs +++ b/src/TextToTalk/Backends/StreamFormat.cs @@ -8,4 +8,5 @@ public enum StreamFormat Azure, System, Uberduck, + Piper, } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs index 2c1a2538..3a24ecd3 100644 --- a/src/TextToTalk/Backends/StreamingSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -148,38 +148,61 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) WaveFormat chunkFormat = nextItem.Format switch { StreamFormat.Wave => Wave, - StreamFormat.Azure => Azure, // Ensure this is 24000Hz for OpenAI PCM! 
+ StreamFormat.Azure => Azure, + StreamFormat.Piper => Wave, _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."), }; lock (this.soundLock) { EnsureHardwareInitialized(chunkFormat); - byte[] chunkBuffer = new byte[16384]; int bytesRead; - bool latencyLogged = false; // Flag to ensure we only log once + bool latencyLogged = false; + + while ((bytesRead = nextItem.Data.Read(chunkBuffer, 0, chunkBuffer.Length)) > 0) { ApplyVolumeToPcmBuffer(chunkBuffer, bytesRead, nextItem.Volume); this.bufferedProvider.AddSamples(chunkBuffer, 0, bytesRead); - // Condition: Buffer has enough "cushion" to prevent the blip + // 2. Start hardware if it's not already playing if (this.bufferedProvider.BufferedBytes > 16384 && this.soundOut.PlaybackState != PlaybackState.Playing) { - this.soundOut.Play(); - - // Log latency exactly once when the sound actually starts - if (!latencyLogged && nextItem.StartTime.HasValue) + // 1. Log latency immediately when we start processing the item + if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); - latencyLogged = true; + Log.Information("Total Latency (Say -> Processing): {Ms}ms", elapsed.TotalMilliseconds); } + + this.soundOut.Play(); + latencyLogged = true; + } + } + + + // 3. WAIT AND STOP: Release the hardware lock once this item (and any previous) is finished + // This is critical to ensure the NEXT item sees the state as 'Not Playing' + // This is necessary because of some weirdness with the Piper backend. This code is not compatible with Azure + if (nextItem.Format == StreamFormat.Piper) + { + while (this.bufferedProvider.BufferedBytes > 0) + { + // Small sleep to prevent CPU spiking while waiting for hardware to finish the buffer + Thread.Sleep(10); + } + + if (this.soundOut.PlaybackState == PlaybackState.Playing) + { + this.soundOut.Stop(); // This resets the state to Stopped/NotPlaying + Log.Debug("Playback finished, hardware stopped and lock released."); } } } + + // 4. 
Dispose the stream after playback is complete nextItem.Data.Dispose(); } diff --git a/src/TextToTalk/Backends/TTSBackend.cs b/src/TextToTalk/Backends/TTSBackend.cs index e7cab46e..f9637299 100644 --- a/src/TextToTalk/Backends/TTSBackend.cs +++ b/src/TextToTalk/Backends/TTSBackend.cs @@ -1,5 +1,6 @@ using System; using TextToTalk.Backends.Kokoro; +using TextToTalk.Backends.Piper; namespace TextToTalk.Backends { @@ -14,6 +15,7 @@ public enum TTSBackend : long OpenAi, GoogleCloud, Kokoro, + Piper, } public static class TTSBackendExtensions @@ -32,6 +34,8 @@ public static string GetFormattedName(this TTSBackend backend, PluginConfigurati TTSBackend.GoogleCloud => "Google Cloud", TTSBackend.Kokoro when config != null && KokoroBackend.IsModelFileDownloaded(config) => "Kokoro", TTSBackend.Kokoro => "Kokoro (169MB download required)", + TTSBackend.Piper when config != null && PiperBackend.IsModelFileDownloaded(config) => "Piper", + TTSBackend.Piper => "Piper (download required)", _ => throw new ArgumentOutOfRangeException(nameof(backend)), }; } @@ -49,6 +53,7 @@ public static bool AreLexiconsEnabled(this TTSBackend backend) TTSBackend.OpenAi => false, TTSBackend.GoogleCloud => false, TTSBackend.Kokoro => false, + TTSBackend.Piper => false, _ => throw new ArgumentOutOfRangeException(nameof(backend)), }; } diff --git a/src/TextToTalk/Backends/VoiceBackendManager.cs b/src/TextToTalk/Backends/VoiceBackendManager.cs index e5392502..5c17c571 100644 --- a/src/TextToTalk/Backends/VoiceBackendManager.cs +++ b/src/TextToTalk/Backends/VoiceBackendManager.cs @@ -9,6 +9,7 @@ using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -113,6 +114,7 @@ private VoiceBackend CreateBackendFor(TTSBackend backendKind) TTSBackend.OpenAi => new OpenAiBackend(this.config, this.http, this.notificationService), TTSBackend.GoogleCloud => new GoogleCloudBackend(this.config), TTSBackend.Kokoro => new KokoroBackend(this.config), + TTSBackend.Piper => new PiperBackend(this.config), _ => throw new ArgumentOutOfRangeException(nameof(backendKind)), }; } diff --git a/src/TextToTalk/GameEnums/AdditionalChatType.cs b/src/TextToTalk/GameEnums/AdditionalChatType.cs index 76c992dc..12b806d8 100644 --- a/src/TextToTalk/GameEnums/AdditionalChatType.cs +++ b/src/TextToTalk/GameEnums/AdditionalChatType.cs @@ -10,7 +10,7 @@ public enum AdditionalChatType Gathering = 67, FCAnnouncement = 69, FCLogin = 70, - RetainerSale = 71, + //RetainerSale = 71, PartyFinderState = 72, ActionUsedOnYou = 2091, FailedActionUsedOnYou = 2218, diff --git a/src/TextToTalk/TextToTalk.cs b/src/TextToTalk/TextToTalk.cs index 6aca4860..fc444030 100644 --- a/src/TextToTalk/TextToTalk.cs +++ b/src/TextToTalk/TextToTalk.cs @@ -23,6 +23,7 @@ using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -507,6 +508,7 @@ private static unsafe bool TryGetCharacter(GameObject? 
speaker, OpenAiBackend => GetVoiceForSpeaker(name, gender), GoogleCloudBackend => GetVoiceForSpeaker(name, gender), KokoroBackend => GetVoiceForSpeaker(name, gender), + PiperBackend => GetVoiceForSpeaker(name, gender), _ => throw new InvalidOperationException("Failed to get voice preset for backend."), }; } diff --git a/src/TextToTalk/TextToTalk.csproj b/src/TextToTalk/TextToTalk.csproj index f0d7b4ed..c32a5669 100644 --- a/src/TextToTalk/TextToTalk.csproj +++ b/src/TextToTalk/TextToTalk.csproj @@ -42,6 +42,7 @@ + compile; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/TextToTalk/VoicePresetConfiguration.cs b/src/TextToTalk/VoicePresetConfiguration.cs index cea5a588..827ebbf3 100644 --- a/src/TextToTalk/VoicePresetConfiguration.cs +++ b/src/TextToTalk/VoicePresetConfiguration.cs @@ -1,14 +1,15 @@ -using System; +using Newtonsoft.Json; +using System; using System.Collections.Generic; using System.IO; using System.Linq; -using Newtonsoft.Json; using TextToTalk.Backends; using TextToTalk.Backends.Azure; using TextToTalk.Backends.ElevenLabs; using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -269,6 +270,17 @@ private static VoicePreset RepairPreset(IDictionary corrupted) InternalName = GetNullableValue(corrupted, "InternalName"), EnabledBackend = TTSBackend.Kokoro }, + TTSBackend.Piper => new PiperVoicePreset + { + Id = Convert.ToInt32(GetNullableValue(corrupted, "Id")), + Speed = Convert.ToSingle(GetNullableValue(corrupted, "Speed")), + Volume = Convert.ToSingle(GetNullableValue(corrupted, "Volume")), + Name = GetNullableValue(corrupted, "Name"), + InternalName = GetNullableValue(corrupted, "InternalName"), + ModelPath = GetNullableValue(corrupted, "ModelPath"), + + EnabledBackend = TTSBackend.Piper + }, _ => throw new ArgumentOutOfRangeException($"{backendCorrupt}"), }; } diff --git a/src/TextToTalk/packages.lock.json b/src/TextToTalk/packages.lock.json index b2523558..a8398281 100644 --- a/src/TextToTalk/packages.lock.json +++ b/src/TextToTalk/packages.lock.json @@ -86,6 +86,18 @@ "System.ClientModel": "1.8.1" } }, + "PiperSharp": { + "type": "Direct", + "requested": "[1.0.6, )", + "resolved": "1.0.6", + "contentHash": "g68TbampKc0ATx80nur6LHHrhIpXvmioIVuwAuWKcjTXTB2tf+Klk4JPwzWZRo+DRSR4kS370eh+davEQVR0cw==", + "dependencies": { + "NAudio": "2.2.1", + "NAudio.Core": "2.2.1", + "Newtonsoft.Json": "13.0.1", + "SharpCompress": "0.36.0" + } + }, "R3": { "type": "Direct", "requested": "[1.2.9, )", @@ -370,6 +382,14 @@ "resolved": "5.0.0-pre.13", "contentHash": "65qbZS49AfrTM6jtZ2RDTWAzLe13ywCXIiSP5QrAJLmZT6sQqHGd1LfFXLhx8Ccp77qy7qh/LHsxpUOlkgZTCg==" }, + "SharpCompress": { + "type": "Transitive", + "resolved": "0.36.0", + "contentHash": "48am//T6Ou+GmyPmBaxaFN1ym0VNidRcBeANr9+OYTzpKRz8QMGzAkHVkCV30lFQ/gnWqGr50AuebahpG1C6xA==", + "dependencies": { + "ZstdSharp.Port": "0.7.4" + } + }, "System.ClientModel": { "type": "Transitive", "resolved": "1.8.1", @@ -407,6 +427,11 @@ "resolved": "15.3.0", "contentHash": "F93japYa9YrJ59AZGhgdaUGHN7ITJ55FBBg/D/8C0BDgahv/rQD6MOSwHxOJJpon1kYyslVbeBrQ2wcJhox01w==" }, + "ZstdSharp.Port": { + "type": "Transitive", + "resolved": "0.7.4", + "contentHash": "ziptnotpUJr51afwXJQ5Wc03dvDiZAdmxS08s1g7SHn/VzbyZUXdH6yORk/zaNjzUOEE6pVZ0Nqztab0rYROgQ==" + }, "texttotalk.data": { "type": "Project", "dependencies": { From d62fdf4d8573ab407432a21b407022a4e364cef0 
Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 14 Jan 2026 11:37:56 -0500 Subject: [PATCH 04/13] Cleaned up Piper UI. Streaming Pipe --- src/TextToTalk.Tests/packages.lock.json | 31 ++++- src/TextToTalk/Backends/Piper/PiperBackend.cs | 117 ++++++++++++++++-- .../Backends/Piper/PiperBackendUI.cs | 106 ++++++++++++---- src/TextToTalk/Backends/StreamFormat.cs | 2 + .../Backends/StreamingSoundQueue.cs | 28 +---- 5 files changed, 223 insertions(+), 61 deletions(-) diff --git a/src/TextToTalk.Tests/packages.lock.json b/src/TextToTalk.Tests/packages.lock.json index 4c576c91..c19dfe45 100644 --- a/src/TextToTalk.Tests/packages.lock.json +++ b/src/TextToTalk.Tests/packages.lock.json @@ -259,10 +259,36 @@ "Microsoft.ML.OnnxRuntime.Managed": "1.22.0" } }, + "Microsoft.ML.OnnxRuntime.Gpu": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "4GNQUc6FHiWHvp95Yhu95SUDa6HVm+RSQxm7QCH3PIlderDhTPdU98fHHKXmLy4xIQikkEraMcGe+KXEQU5tew==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Gpu.Linux": "1.23.2", + "Microsoft.ML.OnnxRuntime.Gpu.Windows": "1.23.2", + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, + "Microsoft.ML.OnnxRuntime.Gpu.Linux": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "bcv2zpP8GNnfdUCkOjE9lzIoslAOCuY0T9QHpI5+Qm6qUcehRPtGC8wF4nvySwyfTe0g3rVINP3SSj1zinkE7Q==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, + "Microsoft.ML.OnnxRuntime.Gpu.Windows": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "qOU3DVcxq4XalFV3wlrNrdatYWufIqvg8FZqVC3LS2rFPoTfl++xpMC2nnaxB2Wc5jrpDb2izrcDsQatCyjVnA==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, "Microsoft.ML.OnnxRuntime.Managed": { "type": "Transitive", - "resolved": "1.22.0", - "contentHash": "zlG3eY5mJnx1BhYAxRwpuHCGHzl3B+cY5/se0RmlVBw6Yh6QTGjPAXdjhlBIcw6BPFhgMn9lxWPE/U3Fvis+BQ==", + "resolved": "1.23.2", + "contentHash": "HtlQuzmVrqhnkmwfmkQ+2re8xPxtVmeLRQaYSJ3pXfzKs4b36+yBfa/LnDuzfX1bGcyWn/McKxmbY87TCAmo1Q==", "dependencies": { "System.Numerics.Tensors": "9.0.0" } @@ -510,6 +536,7 @@ "Google.Cloud.TextToSpeech.V1": "[3.17.0, )", "KokoroSharp.CPU": "[0.6.1, )", "Microsoft.CognitiveServices.Speech": "[1.41.1, )", + "Microsoft.ML.OnnxRuntime.Gpu": "[1.23.2, )", "NAudio": "[2.2.1, )", "OpenAI": "[2.8.0, )", "PiperSharp": "[1.0.6, )", diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs index ad7e1250..e1f3dd72 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackend.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -1,5 +1,6 @@ using Dalamud.Bindings.ImGui; using Dalamud.Game; +using Microsoft.ML.OnnxRuntime; using PiperSharp; using PiperSharp.Models; using Serilog; @@ -22,9 +23,10 @@ public class PiperBackend : VoiceBackend private readonly PiperBackendUI ui; private readonly StreamingSoundQueue soundQueue; private readonly Task modelTask; - private readonly CancellationTokenSource cts = new(); + private CancellationTokenSource cts = new(); private Process? 
piperServerProcess; + private readonly object processLock = new(); private string GetVoicesDir(PluginConfiguration config) => Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); @@ -57,7 +59,7 @@ public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) var allModels = await PiperDownloader.GetHuggingFaceModelList(); var filteredModels = allModels - .Where(m => m.Key.StartsWith("en") && m.Key.EndsWith("medium")) + .Where(m => m.Key.StartsWith("en") && (m.Key.EndsWith("medium") || m.Key.EndsWith("low") || m.Key.EndsWith("high"))) .ToList(); foreach (var modelEntry in filteredModels) @@ -130,14 +132,14 @@ public override void Say(SayRequest request) if (!modelTask.IsCompletedSuccessfully) return; - Say(request.Text, voicePreset, request.Source); + Task.Run(async () => await Say(request.Text, (PiperVoicePreset)request.Voice, request.Source)); } public async Task Say(string text, PiperVoicePreset voicePreset, TextSource source) { - long methodStart = Stopwatch.GetTimestamp(); - long? timestampToPass = methodStart; + long? timestampToPass = Stopwatch.GetTimestamp(); + // 1. Validation if (string.IsNullOrEmpty(voicePreset.ModelPath) || !File.Exists(voicePreset.ModelPath)) { DetailedLog.Error($"Piper model file not found: {voicePreset.ModelPath}"); @@ -146,31 +148,120 @@ public async Task Say(string text, PiperVoicePreset voicePreset, TextSource sour try { + // 2. Prepare Model and Arguments var voiceDir = Path.GetDirectoryName(voicePreset.ModelPath); - var voiceModel = await VoiceModel.LoadModel(voiceDir); + piper.Configuration.Model = await VoiceModel.LoadModel(voiceDir); + piper.Configuration.SpeakingRate = 1.0f / (voicePreset.Speed ?? 1f); - piper.Configuration.Model = voiceModel; - piper.Configuration.SpeakingRate = 1.0f / voicePreset.Speed ?? 1f; + string args = piper.Configuration.BuildArguments(); - byte[] audioData = await piper.InferAsync(text, AudioOutputType.Raw, cts.Token); - if (audioData == null || audioData.Length == 0) return; - var audioStream = new MemoryStream(audioData); - soundQueue.EnqueueSound(audioStream, source, voicePreset.Volume ?? 1f, StreamFormat.Piper, null, timestampToPass); + // 3. Initialize Process + var process = new Process(); + process.StartInfo = new ProcessStartInfo + { + FileName = piper.Configuration.ExecutableLocation, + Arguments = args, + RedirectStandardInput = true, + RedirectStandardOutput = true, + UseShellExecute = false, + CreateNoWindow = true, + }; + + // 4. Thread-Safe Process Management + lock (processLock) + { + // Kill any dangling process before starting a new one + KillActiveProcessInternal(); + piperServerProcess = process; + } + + // 5. THE CANCELLATION BRIDGE + using var registration = cts.Token.Register(() => KillActiveProcessInternal()); + + process.Start(); + + // 6. Check for cancellation before writing to the pipe + if (cts.Token.IsCancellationRequested) + throw new OperationCanceledException(cts.Token); + + // 7. Write Text to StandardInput + using (var sw = new StreamWriter(process.StandardInput.BaseStream, leaveOpen: false)) + { + await sw.WriteLineAsync(text); + await sw.FlushAsync(); + } + + // 8. Determine Audio Format + var format = voicePreset.InternalName switch + { + string name when name.EndsWith("low") => StreamFormat.PiperLow, + string name when name.EndsWith("high") => StreamFormat.PiperHigh, + _ => StreamFormat.Piper // Defaults to Medium/Standard + }; + + // 9. Enqueue Stream + soundQueue.EnqueueSound(process.StandardOutput.BaseStream, source, voicePreset.Volume ?? 
1f, format, null, timestampToPass); + + // 10. Await process exit + await process.WaitForExitAsync(cts.Token); + } + catch (OperationCanceledException) + { + Log.Information("Piper synthesis task was cancelled."); } catch (Exception ex) { - DetailedLog.Error($"Piper switching/inference failed: {ex.Message}"); + DetailedLog.Error($"Piper streaming failed: {ex.Message}"); + } + finally + { + KillActiveProcessInternal(); + } + } + + private void KillActiveProcessInternal() + { + lock (processLock) + { + if (piperServerProcess != null) + { + try + { + piperServerProcess.Kill(true); + } + catch (Exception ex) + { + DetailedLog.Debug($"Error killing piper process: {ex.Message}"); + } + finally + { + piperServerProcess.Dispose(); + piperServerProcess = null; + } + } } } public override void CancelAllSpeech() { + KillActiveProcessInternal(); + soundQueue.CancelAllSounds(); + soundQueue.StopHardware(); + cts.Cancel(); + cts.Dispose(); + cts = new CancellationTokenSource(); } public override void CancelSay(TextSource source) { + KillActiveProcessInternal(); + soundQueue.CancelFromSource(source); + soundQueue.StopHardware(); + cts.Cancel(); + cts.Dispose(); + cts = new CancellationTokenSource(); } public override void DrawSettings(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs index cfb0cdb3..b5fb0b53 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -1,8 +1,10 @@ using Dalamud.Bindings.ImGui; using Dalamud.Utility; using System; +using System.Collections.Generic; using System.IO; using System.Linq; +using System.Text.Json; using TextToTalk.UI; namespace TextToTalk.Backends.Piper; @@ -13,6 +15,41 @@ public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBacken private string[] modelDisplayNames; private int selectedModelIndex = -1; + public class PiperModelInfo + { + public string FullPath { get; set; } + public string DisplayName { get; set; } + public string Quality { get; set; } // low, medium, high + + public static PiperModelInfo FromPath(string onnxPath) + { + var jsonPath = onnxPath + ".json"; + if (!File.Exists(jsonPath)) return null; + + try + { + var json = File.ReadAllText(jsonPath); + using var doc = JsonDocument.Parse(json); + var root = doc.RootElement; + + var lang = root.GetProperty("language").GetProperty("code").GetString(); + var dataset = root.GetProperty("dataset").GetString(); + var quality = root.GetProperty("audio").GetProperty("quality").GetString(); + + return new PiperModelInfo + { + FullPath = onnxPath, + DisplayName = $"{lang}: {dataset} ({quality})", + Quality = quality?.ToLower() ?? "medium" + }; + } + catch { return null; } + } + } + + private List sortedModels = new(); + private string[] sortedDisplayNames = Array.Empty(); + public void DrawVoicePresetOptions() { ImGui.TextColored(ImColor.HintColor, "Piper is a local neural TTS engine. 
Ensure you have downloaded models."); @@ -58,36 +95,63 @@ public void DrawVoicePresetOptions() config.Save(); } - // --- Model Selection --- - var piperDir = Path.Combine(config.GetPluginConfigDirectory(), "piper"); - var voicesDir = Path.Combine(piperDir, "voices"); + // Model Selection + var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + if (!Directory.Exists(voicesDir)) Directory.CreateDirectory(voicesDir); - if (!Directory.Exists(voicesDir)) - Directory.CreateDirectory(voicesDir); + var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); + var allModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); - availableModelPaths = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); - modelDisplayNames = availableModelPaths.Select(Path.GetFileName).ToArray(); - - if (availableModelPaths.Length > 0) + if (allModels.Count > 0) { - if (selectedModelIndex == -1 || - selectedModelIndex >= availableModelPaths.Length || - availableModelPaths[selectedModelIndex] != currentVoicePreset.ModelPath) - { - selectedModelIndex = Array.IndexOf(availableModelPaths, currentVoicePreset.ModelPath ?? ""); - if (selectedModelIndex == -1) selectedModelIndex = 0; - } + ImGui.Text("Voice Model Selection"); + + var currentModel = allModels.FirstOrDefault(m => m.FullPath == currentVoicePreset.ModelPath); + string previewValue = currentModel?.DisplayName ?? "Select a model..."; - if (ImGui.Combo($"Voice Model (.onnx)##{MemoizedId.Create()}", ref selectedModelIndex, modelDisplayNames, modelDisplayNames.Length)) + if (ImGui.BeginCombo($"##ModelSelect{MemoizedId.Create()}", previewValue)) { - currentVoicePreset.ModelPath = availableModelPaths[selectedModelIndex]; - currentVoicePreset.InternalName = modelDisplayNames[selectedModelIndex].Replace(".onnx", ""); - config.Save(); + var qualities = new[] { "high", "medium", "low" }; + + foreach (var quality in qualities) + { + var modelsInSection = allModels.Where(m => m.Quality == quality).OrderBy(m => m.DisplayName).ToList(); + + if (modelsInSection.Count > 0) + { + string headerText = quality switch + { + "high" => "HIGH Quality - 24khz - higher latency", + "medium" => "MEDIUM Quality - 22.5khz - mid latency", + "low" => "LOW Quality - 16khz - lower latency", + _ => quality.ToUpper() + }; + + ImGui.Spacing(); + ImGui.TextDisabled(headerText); + ImGui.Separator(); + + foreach (var model in modelsInSection) + { + bool isSelected = currentVoicePreset.ModelPath == model.FullPath; + + if (ImGui.Selectable($"{model.DisplayName}##{model.FullPath}", isSelected)) + { + currentVoicePreset.ModelPath = model.FullPath; + currentVoicePreset.InternalName = Path.GetFileNameWithoutExtension(model.FullPath); + config.Save(); + } + + if (isSelected) ImGui.SetItemDefaultFocus(); + } + } + } + ImGui.EndCombo(); } } else { - ImGui.TextColored(ImColor.Red, $"No .onnx models found in subdirectories of: {voicesDir}"); + ImGui.TextColored(ImColor.Red, "No voice models found."); } if (ImGui.Button($"Download Models##{MemoizedId.Create()}")) diff --git a/src/TextToTalk/Backends/StreamFormat.cs b/src/TextToTalk/Backends/StreamFormat.cs index 3404baf6..607e9987 100644 --- a/src/TextToTalk/Backends/StreamFormat.cs +++ b/src/TextToTalk/Backends/StreamFormat.cs @@ -9,4 +9,6 @@ public enum StreamFormat System, Uberduck, Piper, + PiperLow, + PiperHigh, } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs index 
3a24ecd3..03099585 100644 --- a/src/TextToTalk/Backends/StreamingSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -144,12 +144,13 @@ private void ProcessMp3Stream(StreamingSoundQueueItem nextItem) private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) { - // Resolve format WaveFormat chunkFormat = nextItem.Format switch { StreamFormat.Wave => Wave, StreamFormat.Azure => Azure, - StreamFormat.Piper => Wave, + StreamFormat.Piper => Uberduck, + StreamFormat.PiperLow => Azure, + StreamFormat.PiperHigh => Wave, _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."), }; @@ -167,10 +168,8 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) ApplyVolumeToPcmBuffer(chunkBuffer, bytesRead, nextItem.Volume); this.bufferedProvider.AddSamples(chunkBuffer, 0, bytesRead); - // 2. Start hardware if it's not already playing if (this.bufferedProvider.BufferedBytes > 16384 && this.soundOut.PlaybackState != PlaybackState.Playing) { - // 1. Log latency immediately when we start processing the item if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); @@ -181,28 +180,7 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) latencyLogged = true; } } - - - // 3. WAIT AND STOP: Release the hardware lock once this item (and any previous) is finished - // This is critical to ensure the NEXT item sees the state as 'Not Playing' - // This is necessary because of some weirdness with the Piper backend. This code is not compatible with Azure - if (nextItem.Format == StreamFormat.Piper) - { - while (this.bufferedProvider.BufferedBytes > 0) - { - // Small sleep to prevent CPU spiking while waiting for hardware to finish the buffer - Thread.Sleep(10); - } - - if (this.soundOut.PlaybackState == PlaybackState.Playing) - { - this.soundOut.Stop(); // This resets the state to Stopped/NotPlaying - Log.Debug("Playback finished, hardware stopped and lock released."); - } - } } - - // 4. Dispose the stream after playback is complete nextItem.Data.Dispose(); } From 098cfe50fac0366c414fdfaee987be1307e3afc2 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 14 Jan 2026 14:31:24 -0500 Subject: [PATCH 05/13] Added Voice Downloader in Piper for user control --- src/TextToTalk/Backends/Piper/PiperBackend.cs | 137 ++++++++++--- .../Backends/Piper/PiperBackendUI.cs | 182 +++++++++++++++++- 2 files changed, 286 insertions(+), 33 deletions(-) diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs index e1f3dd72..e247ea3c 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackend.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -5,6 +5,7 @@ using PiperSharp.Models; using Serilog; using System; +using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.IO; @@ -24,26 +25,37 @@ public class PiperBackend : VoiceBackend private readonly StreamingSoundQueue soundQueue; private readonly Task modelTask; private CancellationTokenSource cts = new(); + private readonly PluginConfiguration config; private Process? 
piperServerProcess; private readonly object processLock = new(); - private string GetVoicesDir(PluginConfiguration config) => + public string GetVoicesDir(PluginConfiguration config) => Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); public PiperBackend(PluginConfiguration config) { - ui = new PiperBackendUI(config, this); - string piperExe = Path.Join(config.GetPluginConfigDirectory(), "piper", "piper.exe"); + this.ui = new PiperBackendUI(config, this); - piper = new PiperProvider(new PiperConfiguration() + // 1. Point to the nested 'piper' subfolder created by ExtractPiper + string piperBaseDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper"); + string piperExe = Path.Combine(piperBaseDir, "piper.exe"); + + this.piper = new PiperProvider(new PiperConfiguration() { ExecutableLocation = piperExe, - WorkingDirectory = Path.GetDirectoryName(piperExe) + // 2. Set the working directory to the folder containing the .exe + WorkingDirectory = piperBaseDir }); - modelTask = LoadOrDownloadModelAsync(config); - soundQueue = new StreamingSoundQueue(config); + this.modelTask = LoadOrDownloadModelAsync(config); + this.soundQueue = new StreamingSoundQueue(config); + this.config = config; + } + + public async Task> GetAvailableModels() + { + return await PiperDownloader.GetHuggingFaceModelList(); } public static bool IsModelFileDownloaded(PluginConfiguration config) { @@ -51,37 +63,115 @@ public static bool IsModelFileDownloaded(PluginConfiguration config) return File.Exists(piperExePath); } + /// + /// Downloads a specific model and initializes its folder structure. + /// + public async Task DownloadSpecificModel(string modelKey, VoiceModel entry) + { + string voicesDir = GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + try + { + DetailedLog.Info($"Downloading voice: {modelKey}"); + // Downloads into voicesDir/modelKey/ + await entry.DownloadModel(voicesDir); + + // Prepare the model.json for PiperSharp + string onnxPath = Path.Combine(modelTargetDir, $"{modelKey}.onnx"); + await LoadSpecificVoiceModel(onnxPath); + + DetailedLog.Info($"Successfully installed {modelKey}"); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to download {modelKey}: {ex.Message}"); + } + } + + public bool DeleteVoiceModel(string modelKey) + { + try + { + // 1. Kill any active speech to unlock files + KillActiveProcessInternal(); + + string voicesDir = GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + if (Directory.Exists(modelTargetDir)) + { + // Delete the folder and all contents (.onnx, .json) + Directory.Delete(modelTargetDir, true); + DetailedLog.Info($"Deleted voice model: {modelKey}"); + return true; + } + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to delete {modelKey}: {ex.Message}"); + } + return false; + } + /// Downloads the Piper executable and initial voice models. 
public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) { string configDir = config.GetPluginConfigDirectory(); string voicesDir = GetVoicesDir(config); + + // Ensure the executable is downloaded first + await EnsureExecutableDownloaded(config); + + // Fetch all available models from Hugging Face var allModels = await PiperDownloader.GetHuggingFaceModelList(); - var filteredModels = allModels - .Where(m => m.Key.StartsWith("en") && (m.Key.EndsWith("medium") || m.Key.EndsWith("low") || m.Key.EndsWith("high"))) - .ToList(); + // TARGET: Only download "en_US-lessac-medium" initially + string starterModelKey = "en_US-lessac-medium"; - foreach (var modelEntry in filteredModels) + if (allModels.TryGetValue(starterModelKey, out var modelEntry)) { - string modelKey = modelEntry.Key; - string modelTargetDir = Path.Combine(voicesDir, modelKey); - - if (File.Exists(Path.Combine(modelTargetDir, $"{modelKey}.onnx"))) continue; + string modelTargetDir = Path.Combine(voicesDir, starterModelKey); - try + // Skip if already downloaded + if (!File.Exists(Path.Combine(modelTargetDir, $"{starterModelKey}.onnx"))) { - DetailedLog.Info($"Downloading English medium voice: {modelKey}"); - await modelEntry.Value.DownloadModel(voicesDir); + try + { + DetailedLog.Info($"Downloading starter English voice: {starterModelKey}"); + + // Downloads the model to voicesDir/en_US-lessac-medium/ + await modelEntry.DownloadModel(voicesDir); - string onnxPath = Path.Combine(modelTargetDir, $"{modelKey}.onnx"); - await LoadSpecificVoiceModel(onnxPath); + string onnxPath = Path.Combine(modelTargetDir, $"{starterModelKey}.onnx"); + + // Initialize the model config and directory + await LoadSpecificVoiceModel(onnxPath); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to download starter voice {starterModelKey}: {ex.Message}"); + } } - catch (Exception ex) + else { - DetailedLog.Error($"Failed to download {modelKey}: {ex.Message}"); + DetailedLog.Info($"Starter voice {starterModelKey} already exists."); } } + else + { + DetailedLog.Error($"Starter voice {starterModelKey} not found in the Hugging Face model list."); + } + } + + private async Task EnsureExecutableDownloaded(PluginConfiguration config) + { + if (!IsModelFileDownloaded(config)) + { + string piperDir = Path.Combine(config.GetPluginConfigDirectory(), "piper"); + DetailedLog.Info("Piper executable missing. Downloading..."); + await PiperDownloader.DownloadPiper().ExtractPiper(piperDir); + } } private async Task LoadOrDownloadModelAsync(PluginConfiguration config) @@ -165,6 +255,7 @@ public async Task Say(string text, PiperVoicePreset voicePreset, TextSource sour RedirectStandardOutput = true, UseShellExecute = false, CreateNoWindow = true, + }; // 4. 
Thread-Safe Process Management @@ -219,7 +310,7 @@ string name when name.EndsWith("high") => StreamFormat.PiperHigh, } } - private void KillActiveProcessInternal() + public void KillActiveProcessInternal() { lock (processLock) { diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs index b5fb0b53..8d9cfd8e 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -5,7 +5,10 @@ using System.IO; using System.Linq; using System.Text.Json; +using System.Threading.Tasks; using TextToTalk.UI; +using PiperSharp; +using PiperSharp.Models; namespace TextToTalk.Backends.Piper; @@ -15,12 +18,21 @@ public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBacken private string[] modelDisplayNames; private int selectedModelIndex = -1; + private bool showDownloader = false; + private IDictionary remoteModels; + private string searchQuery = ""; + private List cachedModels = new(); // For live list updates + private DateTime lastScan = DateTime.MinValue; + private bool isScanning = false; + public class PiperModelInfo { public string FullPath { get; set; } public string DisplayName { get; set; } public string Quality { get; set; } // low, medium, high + + public static PiperModelInfo FromPath(string onnxPath) { var jsonPath = onnxPath + ".json"; @@ -49,7 +61,8 @@ public static PiperModelInfo FromPath(string onnxPath) private List sortedModels = new(); private string[] sortedDisplayNames = Array.Empty(); - + private string voicesFolderSize = "0 MB"; + public void DrawVoicePresetOptions() { ImGui.TextColored(ImColor.HintColor, "Piper is a local neural TTS engine. Ensure you have downloaded models."); @@ -95,16 +108,45 @@ public void DrawVoicePresetOptions() config.Save(); } + // 1. REFACTORED MODEL SCANNING (Async polling for new downloads) + if (!isScanning && (DateTime.Now - lastScan).TotalSeconds > 3) + { + lastScan = DateTime.Now; + isScanning = true; + + Task.Run(() => + { + try + { + var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + if (Directory.Exists(voicesDir)) + { + // 1. Existing Model Scanning logic + var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); + cachedModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); + + // 2. NEW: Calculate Folder Size + var dirInfo = new DirectoryInfo(voicesDir); + // Sum all files in all subdirectories + long totalBytes = dirInfo.EnumerateFiles("*", SearchOption.AllDirectories).Sum(fi => fi.Length); + + // Convert to MB and format + voicesFolderSize = $"{(totalBytes / 1024f / 1024f):N0} MB"; + } + } + finally { isScanning = false; } + }); + } + // Model Selection var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); if (!Directory.Exists(voicesDir)) Directory.CreateDirectory(voicesDir); var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); var allModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); - - if (allModels.Count > 0) + if (cachedModels.Count > 0) { - ImGui.Text("Voice Model Selection"); + var currentModel = allModels.FirstOrDefault(m => m.FullPath == currentVoicePreset.ModelPath); string previewValue = currentModel?.DisplayName ?? 
"Select a model..."; @@ -148,18 +190,14 @@ public void DrawVoicePresetOptions() } ImGui.EndCombo(); } + ImGui.SameLine(); + ImGui.Text("Voice Model Selection"); } else { ImGui.TextColored(ImColor.Red, "No voice models found."); } - if (ImGui.Button($"Download Models##{MemoizedId.Create()}")) - { - _ = piperBackend.EnsurePiperAssetsDownloaded(config); - } - Components.Tooltip("Will download all English voices at medium quality (appx 1.5GB)"); - // --- Voice Parameters --- var speed = currentVoicePreset.Speed ?? 1f; if (ImGui.SliderFloat($"Speed##{MemoizedId.Create()}", ref speed, 0.5f, 3.0f, "%.2fx")) @@ -184,6 +222,35 @@ public void DrawVoicePresetOptions() TextSource.Chat); } } + ImGui.SameLine(); + if (ImGui.Button($"Open Voice Downloader##{MemoizedId.Create()}")) + { + showDownloader = true; + // 2026 Best Practice: Fetch manifest on a background thread to prevent FFXIV frame drops + Task.Run(async () => + { + try + { + // Ensure this returns IDictionary or similar + var models = await piperBackend.GetAvailableModels(); + // Cast or assign to your IDictionary field + remoteModels = models.ToDictionary(k => k.Key, v => (VoiceModel)v.Value); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to fetch Piper manifest: {ex.Message}"); + } + }); + } + Components.Tooltip("Browse and download specific Piper voices from Hugging Face."); + ImGui.SameLine(); + ImGui.TextDisabled($"Local Storage Used: {voicesFolderSize}"); + if (ImGui.IsItemHovered()) + { + ImGui.SetTooltip("Total disk space used by downloaded Piper voice models."); + } + // Render the window if active + if (showDownloader) DrawVoiceDownloader(); ImGui.Separator(); @@ -193,4 +260,99 @@ public void DrawVoicePresetOptions() BackendUI.GenderedPresetConfig("Piper", TTSBackend.Piper, config, presets); } } + private void DrawVoiceDownloader() + { + ImGui.SetNextWindowSize(new global::System.Numerics.Vector2(500, 600), ImGuiCond.FirstUseEver); + if (ImGui.Begin("Piper Voice Downloader", ref showDownloader)) + { + if (remoteModels == null) + { + ImGui.Text("Fetching model list from Hugging Face..."); + ImGui.End(); + return; + } + + ImGui.InputTextWithHint("##Search", "Search voices (e.g. 'en_US' or 'medium')...", ref searchQuery, 64); + + ImGui.BeginChild("ModelList", new global::System.Numerics.Vector2(0, 0), true); + foreach (var model in remoteModels) + { + + var entry = model.Value; + var langName = entry.Language?.Name ?? "Unknown"; + var dataset = entry.Name ?? "Standard"; + var parts = (entry.Key ?? 
"unknown").Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries); + string quality = parts.Last().ToLower(); + + string formattedName = $"{langName} : {dataset} ({quality})"; + + if (!string.IsNullOrEmpty(searchQuery) && !model.Key.Contains(searchQuery, StringComparison.OrdinalIgnoreCase)) + continue; + + // Check if installed + bool isDownloaded = cachedModels.Any(m => m.FullPath.Contains(model.Key)); + bool isInstalled = cachedModels.Any(m => m.FullPath.Contains(Path.Combine("voices", model.Key))); + + ImGui.PushID(model.Key); + ImGui.TextUnformatted(formattedName); + ImGui.SameLine(ImGui.GetWindowWidth() - 120); + + if (isInstalled) + { + ImGui.TextColored(new global::System.Numerics.Vector4(0.5f, 1f, 0.5f, 1f), "Installed"); + + // Add a small delete button to the right of the "Installed" text + ImGui.SameLine(ImGui.GetWindowWidth() - 40); + ImGui.PushStyleColor(ImGuiCol.Button, new global::System.Numerics.Vector4(0.6f, 0.2f, 0.2f, 1f)); // Reddish + if (ImGui.Button("X##Delete")) + { + // Trigger deletion + if (piperBackend.DeleteVoiceModel(model.Key)) + { + // Force a re-scan of the local files immediately so the UI updates + lastScan = DateTime.MinValue; + } + } + ImGui.PopStyleColor(); + if (ImGui.IsItemHovered()) ImGui.SetTooltip("Delete this voice model from your computer."); + } + else + { + if (ImGui.Button("Download")) + { + _ = piperBackend.DownloadSpecificModel(model.Key, (VoiceModel)model.Value); + } + } + ImGui.Separator(); + ImGui.PopID(); + } + ImGui.EndChild(); + ImGui.End(); + } + + } + public bool DeleteVoiceModel(string modelKey) + { + try + { + // 1. Kill any active speech to unlock files + piperBackend.KillActiveProcessInternal(); + + string voicesDir = piperBackend.GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + if (Directory.Exists(modelTargetDir)) + { + // Delete the folder and all contents (.onnx, .json) + Directory.Delete(modelTargetDir, true); + DetailedLog.Info($"Deleted voice model: {modelKey}"); + return true; + } + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to delete {modelKey}: {ex.Message}"); + } + return false; + } } \ No newline at end of file From 0f5b2303f8e0c03c3b2998232e17650c615c4543 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 14 Jan 2026 21:47:05 -0500 Subject: [PATCH 06/13] Fixed up Piper Downloader Tool --- .../Backends/Piper/PiperBackendUI.cs | 78 ++++++++++++++++++- .../Backends/System/SystemBackend.cs | 3 +- .../TextProviders/ChatMessageHandler.cs | 14 ++-- src/TextToTalk/TextToTalk.cs | 2 + 4 files changed, 86 insertions(+), 11 deletions(-) diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs index 8d9cfd8e..7afa1160 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -25,6 +25,8 @@ public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBacken private DateTime lastScan = DateTime.MinValue; private bool isScanning = false; + private HashSet activeDownloads = new HashSet(); + public class PiperModelInfo { public string FullPath { get; set; } @@ -63,6 +65,38 @@ public static PiperModelInfo FromPath(string onnxPath) private string[] sortedDisplayNames = Array.Empty(); private string voicesFolderSize = "0 MB"; + private void DrawLoadingSpinner(string label, float radius, float thickness, uint color) + { + // 1. 
Get current cursor position to draw + var pos = ImGui.GetCursorScreenPos(); + var size = new global::System.Numerics.Vector2(radius * 2, radius * 2); + + // 2. Reserve space in the ImGui layout so other elements don't overlap + ImGui.Dummy(size); + + // 3. Define the center of our circle + var center = new global::System.Numerics.Vector2(pos.X + radius, pos.Y + radius); + var drawList = ImGui.GetWindowDrawList(); + + // 4. Calculate animation timing + float time = (float)ImGui.GetTime(); + int numSegments = 30; + float startAngle = time * 8.0f; // Rotation speed + + // 5. Build the arc path (approx. 270 degrees) + drawList.PathClear(); + for (int i = 0; i <= numSegments; i++) + { + float a = startAngle + ((float)i / numSegments) * (MathF.PI * 1.5f); + drawList.PathLineTo(new global::System.Numerics.Vector2( + center.X + MathF.Cos(a) * radius, + center.Y + MathF.Sin(a) * radius)); + } + + // 6. Draw the stroke + drawList.PathStroke(color, ImDrawFlags.None, thickness); + } + public void DrawVoicePresetOptions() { ImGui.TextColored(ImColor.HintColor, "Piper is a local neural TTS engine. Ensure you have downloaded models."); @@ -262,6 +296,7 @@ public void DrawVoicePresetOptions() } private void DrawVoiceDownloader() { + ImGui.SetNextWindowSize(new global::System.Numerics.Vector2(500, 600), ImGuiCond.FirstUseEver); if (ImGui.Begin("Piper Voice Downloader", ref showDownloader)) { @@ -279,14 +314,26 @@ private void DrawVoiceDownloader() { var entry = model.Value; - var langName = entry.Language?.Name ?? "Unknown"; + var langCode = entry.Language?.Code ?? "unknown"; + + var langName = langCode.ToLower().Replace("-", "_") switch + { + "en_gb" => "English - UK", + "en_us" => "English - US", + "es_ar" => "Spanish - AR", + "es_es" => "Spanish - ES", + "es_mx" => "Spanish - MX", + "nl_be" => "Dutch - BE", + "nl_nl" => "Dutch - NL", + _ => entry.Language?.Name ?? "Unknown" // Fallback to original name + }; var dataset = entry.Name ?? "Standard"; var parts = (entry.Key ?? "unknown").Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries); string quality = parts.Last().ToLower(); string formattedName = $"{langName} : {dataset} ({quality})"; - if (!string.IsNullOrEmpty(searchQuery) && !model.Key.Contains(searchQuery, StringComparison.OrdinalIgnoreCase)) + if (!string.IsNullOrEmpty(searchQuery) && !formattedName.Contains(searchQuery, StringComparison.OrdinalIgnoreCase)) continue; // Check if installed @@ -318,9 +365,32 @@ private void DrawVoiceDownloader() } else { - if (ImGui.Button("Download")) + // CHECK: Is this specific model currently downloading? + if (activeDownloads.Contains(model.Key)) { - _ = piperBackend.DownloadSpecificModel(model.Key, (VoiceModel)model.Value); + // Place the spinner where the button would normally be + DrawLoadingSpinner($"##spinner_{model.Key}", 10.0f, 3.0f, ImGui.GetColorU32(ImGuiCol.ButtonHovered)); + ImGui.SameLine(); + ImGui.Text("Downloading..."); + } + else + { + if (ImGui.Button("Download")) + { + activeDownloads.Add(model.Key); + + // Use a Task.Run or Ensure the continuation happens correctly + _ = piperBackend.DownloadSpecificModel(model.Key, (VoiceModel)model.Value) + .ContinueWith(t => + { + // 1. Force your scanner to see the new files immediately + // Assuming 'lastScan' is what triggers your cachedModels update loop + lastScan = DateTime.MinValue; + + // 2. 
Remove from active downloads AFTER the scan is flagged + activeDownloads.Remove(model.Key); + }); + } } } ImGui.Separator(); diff --git a/src/TextToTalk/Backends/System/SystemBackend.cs b/src/TextToTalk/Backends/System/SystemBackend.cs index 00887ecb..0df3f02c 100644 --- a/src/TextToTalk/Backends/System/SystemBackend.cs +++ b/src/TextToTalk/Backends/System/SystemBackend.cs @@ -1,4 +1,5 @@ -using System; +using Serilog; +using System; using System.Diagnostics; using System.Net.Http; using System.Threading; diff --git a/src/TextToTalk/TextProviders/ChatMessageHandler.cs b/src/TextToTalk/TextProviders/ChatMessageHandler.cs index f3301613..1f05de0e 100644 --- a/src/TextToTalk/TextProviders/ChatMessageHandler.cs +++ b/src/TextToTalk/TextProviders/ChatMessageHandler.cs @@ -127,12 +127,14 @@ private void ProcessChatMessage(ChatMessage chatMessage) if (!this.filters.OnlyMessagesFromYou(speaker?.Name.TextValue ?? sender.TextValue)) return; if (!this.filters.ShouldSayFromYou(speaker?.Name.TextValue ?? sender.TextValue)) return; - - OnTextEmit.Invoke(new ChatTextEmitEvent( - GetCleanSpeakerName(speaker, sender), - textValue, - speaker, - type)); + + else if (type == XivChatType.TellOutgoing && config.SkipMessagesFromYou == true) return; + + OnTextEmit.Invoke(new ChatTextEmitEvent( + GetCleanSpeakerName(speaker, sender), + textValue, + speaker, + type)); } private static SeString GetCleanSpeakerName(IGameObject? speaker, SeString sender) diff --git a/src/TextToTalk/TextToTalk.cs b/src/TextToTalk/TextToTalk.cs index fc444030..5e06fa40 100644 --- a/src/TextToTalk/TextToTalk.cs +++ b/src/TextToTalk/TextToTalk.cs @@ -44,6 +44,8 @@ using static System.Net.Mime.MediaTypeNames; using GameObject = Dalamud.Game.ClientState.Objects.Types.IGameObject; +using Serilog; + namespace TextToTalk { public partial class TextToTalk : IDalamudPlugin From dcff8c54c032cd7a0a1754a7d6af685b1fc5966a Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 14 Jan 2026 22:36:39 -0500 Subject: [PATCH 07/13] Cleaned up Piper UI Model Selection --- .../Backends/Piper/PiperBackendUI.cs | 90 +++++++++++-------- 1 file changed, 53 insertions(+), 37 deletions(-) diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs index 7afa1160..e4e55ebe 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -30,10 +30,9 @@ public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBacken public class PiperModelInfo { public string FullPath { get; set; } - public string DisplayName { get; set; } - public string Quality { get; set; } // low, medium, high - - + public string DisplayName { get; set; } // The dataset name (e.g., "Lessac") + public string Quality { get; set; } + public string LanguageName { get; set; } // The pretty name (e.g., "English - US") public static PiperModelInfo FromPath(string onnxPath) { @@ -46,15 +45,19 @@ public static PiperModelInfo FromPath(string onnxPath) using var doc = JsonDocument.Parse(json); var root = doc.RootElement; - var lang = root.GetProperty("language").GetProperty("code").GetString(); + var langCode = root.GetProperty("language").GetProperty("code").GetString(); + var langPlain = root.GetProperty("language").GetProperty("name_english").GetString(); + + // Pass both code and plain name to your disambiguation helper + var prettyLang = GetPrettyLanguageName(langCode, langPlain); var dataset = root.GetProperty("dataset").GetString(); - var quality = 
root.GetProperty("audio").GetProperty("quality").GetString(); return new PiperModelInfo { FullPath = onnxPath, - DisplayName = $"{lang}: {dataset} ({quality})", - Quality = quality?.ToLower() ?? "medium" + LanguageName = prettyLang, + DisplayName = dataset ?? "Unknown", + Quality = root.GetProperty("audio").GetProperty("quality").GetString()?.ToLower() ?? "medium" }; } catch { return null; } @@ -65,6 +68,23 @@ public static PiperModelInfo FromPath(string onnxPath) private string[] sortedDisplayNames = Array.Empty(); private string voicesFolderSize = "0 MB"; + public static string GetPrettyLanguageName(string code, string fallbackName) + { + if (string.IsNullOrEmpty(code)) return fallbackName ?? "Unknown"; + + return code.ToLower().Replace("-", "_") switch + { + "en_gb" => "English - UK", + "en_us" => "English - US", + "es_ar" => "Spanish - AR", + "es_es" => "Spanish - ES", + "es_mx" => "Spanish - MX", + "nl_be" => "Dutch - BE", + "nl_nl" => "Dutch - NL", + _ => fallbackName // Use "English", "French", etc. from JSON + }; + } + private void DrawLoadingSpinner(string label, float radius, float thickness, uint color) { // 1. Get current cursor position to draw @@ -178,48 +198,44 @@ public void DrawVoicePresetOptions() var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); var allModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); - if (cachedModels.Count > 0) - { - + if (allModels.Count > 0) + { var currentModel = allModels.FirstOrDefault(m => m.FullPath == currentVoicePreset.ModelPath); - string previewValue = currentModel?.DisplayName ?? "Select a model..."; + + // PREVIEW: Uses the pre-parsed properties + string previewValue = currentModel != null + ? $"{currentModel.LanguageName} : {currentModel.DisplayName} ({currentModel.Quality})" + : "Select a model..."; if (ImGui.BeginCombo($"##ModelSelect{MemoizedId.Create()}", previewValue)) { - var qualities = new[] { "high", "medium", "low" }; + // Group by the LanguageName property we parsed from JSON + var languageGroups = allModels + .GroupBy(m => m.LanguageName) + .OrderBy(g => g.Key); - foreach (var quality in qualities) + foreach (var group in languageGroups) { - var modelsInSection = allModels.Where(m => m.Quality == quality).OrderBy(m => m.DisplayName).ToList(); + ImGui.Spacing(); + ImGui.TextDisabled($"--- {group.Key.ToUpper()} ---"); + ImGui.Separator(); - if (modelsInSection.Count > 0) + foreach (var model in group.OrderBy(m => m.DisplayName)) { - string headerText = quality switch - { - "high" => "HIGH Quality - 24khz - higher latency", - "medium" => "MEDIUM Quality - 22.5khz - mid latency", - "low" => "LOW Quality - 16khz - lower latency", - _ => quality.ToUpper() - }; + bool isSelected = currentVoicePreset.ModelPath == model.FullPath; - ImGui.Spacing(); - ImGui.TextDisabled(headerText); - ImGui.Separator(); + // Content: "Lessac (medium)" + string itemLabel = $"{model.LanguageName} : {model.DisplayName} ({model.Quality})"; - foreach (var model in modelsInSection) + if (ImGui.Selectable($"{itemLabel}##{model.FullPath}", isSelected)) { - bool isSelected = currentVoicePreset.ModelPath == model.FullPath; - - if (ImGui.Selectable($"{model.DisplayName}##{model.FullPath}", isSelected)) - { - currentVoicePreset.ModelPath = model.FullPath; - currentVoicePreset.InternalName = Path.GetFileNameWithoutExtension(model.FullPath); - config.Save(); - } - - if (isSelected) ImGui.SetItemDefaultFocus(); + currentVoicePreset.ModelPath = model.FullPath; + currentVoicePreset.InternalName = 
Path.GetFileNameWithoutExtension(model.FullPath); + config.Save(); } + + if (isSelected) ImGui.SetItemDefaultFocus(); } } ImGui.EndCombo(); From 17bd10345565e8ebaf9258bfda08d40b3ba0cd5e Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Fri, 16 Jan 2026 13:28:24 -0500 Subject: [PATCH 08/13] Made Elevenlabs UI more resilient to model indexes. Updated Piper Dropdown --- .../Backends/ElevenLabs/ElevenLabsBackendUI.cs | 9 ++++++--- src/TextToTalk/Backends/Piper/PiperBackend.cs | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs index 02a5b76c..76237023 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs @@ -150,18 +150,21 @@ public void DrawSettings() var modelDescriptionsList = modelDescriptions.Values.Select(v => v.Items.First()).ToList(); var selectedItemIndex = modelIdList.IndexOf(currentVoicePreset.ModelId); - string modelPreviewName = ""; - if (selectedItemIndex != -1) + string modelPreviewName = "Select a model..."; + bool previewHasStyles = false; + + if (selectedItemIndex >= 0 && selectedItemIndex < modelDescriptionsList.Count) { var selectedItem = modelDescriptionsList[selectedItemIndex]; modelPreviewName = $"{selectedItem.ModelId} || Cost Multiplier: {selectedItem.ModelRates["character_cost_multiplier"]}"; + if (currentVoicePreset.ModelId == "eleven_v3") { modelPreviewName += " [Styles Available]"; + previewHasStyles = true; } } - bool previewHasStyles = modelIdList[selectedItemIndex] == "eleven_v3"; string previewName = voiceIndex >= 0 ? $"{modelIdList[selectedItemIndex]} || Cost Multiplier: {modelDescriptionsList[selectedItemIndex].ModelRates["character_cost_multiplier"]}" : "Select a model..."; if (ImGui.BeginCombo($"Models##{MemoizedId.Create()}", "", ImGuiComboFlags.HeightLarge)) diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs index e247ea3c..998b8c99 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackend.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -59,7 +59,7 @@ public async Task> GetAvailableModels() } public static bool IsModelFileDownloaded(PluginConfiguration config) { - var piperExePath = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper.exe"); + var piperExePath = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper", "piper.exe"); return File.Exists(piperExePath); } From 867d2efb9f8355614619c8fd49f0fa37cafe9f97 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 21 Jan 2026 10:17:57 -0500 Subject: [PATCH 09/13] Fixed up Voice Cancellation across all backends to work with streaming refactor --- src/TextToTalk/Backends/Azure/AzureClient.cs | 45 +++++-------------- .../Backends/ElevenLabs/ElevenLabsClient.cs | 12 ++--- .../Backends/GoogleCloud/GoogleCloudClient.cs | 22 ++------- .../Backends/Kokoro/KokoroSoundQueue.cs | 10 +---- .../Backends/OpenAI/OpenAiBackend.cs | 6 --- .../Backends/OpenAI/OpenAiBackendUI.cs | 14 ------ .../Backends/OpenAI/OpenAiClient.cs | 8 ---- src/TextToTalk/Backends/Piper/PiperBackend.cs | 8 ---- .../Backends/Piper/PiperBackendUI.cs | 31 +++---------- src/TextToTalk/Backends/Polly/PollyClient.cs | 7 +-- .../Backends/StreamingSoundQueue.cs | 23 +++++----- .../Backends/System/SystemSoundQueue.cs | 31 +++---------- .../Backends/Uberduck/UberduckBackend.cs | 10 +++++ .../Backends/Uberduck/UberduckBackendUI.cs | 2 - 
.../Backends/Uberduck/UberduckClient.cs | 32 +++++++------ 15 files changed, 72 insertions(+), 189 deletions(-) diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index d0fd168a..16c49827 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -49,16 +49,14 @@ public TextSource GetCurrentlySpokenTextSource() } public List GetVoicesWithStyles() { - // Fetches the voice result asynchronously and waits for completion var res = this.synthesizer.GetVoicesAsync().GetAwaiter().GetResult(); HandleResult(res); - // Maps each voice to a custom object containing Name and StyleList return res.Voices.Select(voice => new VoiceDetails { Name = voice.Name, ShortName = voice.ShortName, - Styles = voice.StyleList.ToList() // StyleList is a string[] + Styles = voice.StyleList.ToList() }).ToList(); } @@ -91,8 +89,7 @@ public async Task Say(string? voice, int playbackRate, float volume, TextSource Content = new StringContent(ssml, global::System.Text.Encoding.UTF8, "application/ssml+xml") }; - // 2026 Low Latency Format: 'raw' is better for direct streaming than 'riff' - request.Headers.Add("X-Microsoft-OutputFormat", "raw-16khz-16bit-mono-pcm"); + request.Headers.Add("X-Microsoft-OutputFormat", "raw-16khz-16bit-mono-pcm"); //Raw for lower latency using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); response.EnsureSuccessStatusCode(); @@ -114,25 +111,30 @@ public async Task Say(string? voice, int playbackRate, float volume, TextSource soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Azure, null, timestampToPass); } - - // Implicitly returns Task.CompletedTask because it is 'async Task' } public Task CancelAllSounds() { - //this.synthesizer.Dispose(); + if (this._ttsCts != null) + { + this._ttsCts.Cancel(); //signal cancellation if in progress + } + this.synthesizer.StopSpeakingAsync(); this.soundQueue.CancelAllSounds(); this.soundQueue.StopHardware(); - this.soundQueue.CancelAllSounds(); return Task.CompletedTask; } public Task CancelFromSource(TextSource source) { + if (this._ttsCts != null) + { + this._ttsCts.Cancel(); //signal cancellation if in progress + } this.synthesizer.StopSpeakingAsync(); - this.soundQueue.StopHardware(); this.soundQueue.CancelFromSource(source); + this.soundQueue.StopHardware(); return Task.CompletedTask; } @@ -144,29 +146,6 @@ private static void HandleResult(SynthesisVoicesResult res) } } - private static void HandleResult(SpeechSynthesisResult res) - { - if (res.Reason == ResultReason.Canceled) - { - var cancellation = SpeechSynthesisCancellationDetails.FromResult(res); - if (cancellation.Reason == CancellationReason.Error) - { - DetailedLog.Error($"Azure request error: ({cancellation.ErrorCode}) \"{cancellation.ErrorDetails}\""); - } - else - { - DetailedLog.Warn($"Azure request failed in state \"{cancellation.Reason}\""); - } - - return; - } - - if (res.Reason != ResultReason.SynthesizingAudioCompleted) - { - DetailedLog.Warn($"Speech synthesis request completed in incomplete state \"{res.Reason}\""); - } - } - public void Dispose() { this.synthesizer?.Dispose(); diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs index aae4eae1..7b8448b0 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs @@ -74,7 +74,6 @@ public async Task Say(string? 
voice, int playbackRate, float volume, float simil var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; - // Use HttpCompletionOption.ResponseHeadersRead to begin processing before the body is fully downloaded using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); AddAuthorization(req); req.Headers.Add("accept", "audio/mpeg"); @@ -82,22 +81,17 @@ public async Task Say(string? voice, int playbackRate, float volume, float simil using var content = new StringContent(JsonConvert.SerializeObject(args), Encoding.UTF8, "application/json"); req.Content = content; - // SendAsync with ResponseHeadersRead is the key for streaming - var res = await this.http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct); + var res = await this.http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct); // Using ResponseHeadersRead in order to stream as synth completes EnsureSuccessStatusCode(res); - // Get the stream directly from the response var responseStream = await res.Content.ReadAsStreamAsync(ct); long? timestampToPass = methodStart; - // Enqueue the live stream. - // IMPORTANT: Your soundQueue must be able to process the stream as bytes arrive. + this.soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Mp3, res, timestampToPass); } catch (OperationCanceledException) { - // 2026 Best Practice: Catch the cancellation exception to prevent it - // from bubbling up as a generic error. - Log.Information("TTS generation was cancelled."); + Log.Information("TTS generation was cancelled."); // Catching cancellation } } diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs index 4d44b695..7c99da1e 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs @@ -38,16 +38,13 @@ public void Init(string pathToCredential) { if (client == null) return new Dictionary(); - // Fetch all available voices var response = client.ListVoices(""); var fetchedVoices = new Dictionary(); foreach (var voice in response.Voices) { - // Filter: Only include voices with "Chirp3" or "Chirp-HD" in their name - // Rebranded "Journey" voices also now fall under "Chirp-HD" - if (voice.Name.Contains("Chirp3") || voice.Name.Contains("Chirp-HD")) + if (voice.Name.Contains("Chirp3") || voice.Name.Contains("Chirp-HD")) // Focusing on Chirp 3 and Chirp HD voices as these are the only ones enabled for streaming. From what I can tell, this actually reduces duplicates of the same voice under different formats. { fetchedVoices.Add(voice.Name, new { @@ -92,10 +89,8 @@ public async Task Say(string? locale, string? voice, float? speed, float volume, try { - // 1. Open the stream with the cancellation token - using var streamingCall = client.StreamingSynthesize(); + using var streamingCall = client.StreamingSynthesize(); // One request to open the stream - // 2. FIRST request: Configuration ONLY var configRequest = new StreamingSynthesizeRequest { StreamingConfig = new StreamingSynthesizeConfig @@ -107,35 +102,26 @@ public async Task Say(string? locale, string? voice, float? speed, float volume, }, StreamingAudioConfig = new StreamingAudioConfig { - // Linear16 is the 2026 standard for Chirp 3 HD PCM streaming AudioEncoding = AudioEncoding.Pcm, SampleRateHertz = 24000, SpeakingRate = speed ?? 1.0f, } } }; - - // Pass token to WriteAsync to stop sending if cancelled await streamingCall.WriteAsync(configRequest); - // 3. 
SECOND request: Input Text ONLY - await streamingCall.WriteAsync(new StreamingSynthesizeRequest + await streamingCall.WriteAsync(new StreamingSynthesizeRequest // One request to send the text and write back the chunks { Input = new StreamingSynthesisInput { Text = text } }); await streamingCall.WriteCompleteAsync(); - // 4. Process the response stream with the cancellation token - await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) { if (response.AudioContent.Length > 0) { var chunkStream = new MemoryStream(response.AudioContent.ToByteArray()); - - // Note: Linear16 audio is typically handled as StreamFormat.Pcm - // but matches Wave if your queue expects raw headerless bytes. long? timestampToPass = methodStart; soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave, null, timestampToPass); } @@ -143,7 +129,7 @@ await streamingCall.WriteAsync(new StreamingSynthesizeRequest } catch (OperationCanceledException) { - // Handle normal cancellation (e.g., stopping the voice) + // Silent Cancellation if token is set to Cancelled } catch (Grpc.Core.RpcException ex) when (ex.StatusCode == Grpc.Core.StatusCode.Cancelled) { diff --git a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs index f150d2d8..cd051bba 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs @@ -21,7 +21,6 @@ public class KokoroSoundQueue : SoundQueue private readonly PluginConfiguration config; private readonly Task modelTask; - // WASAPI Hardware Members private WasapiOut? soundOut; private BufferedWaveProvider? bufferedProvider; @@ -46,7 +45,6 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) { if (!TryGetModel(out var model) || nextItem.Aborted) return; - // 1. Setup WASAPI Hardware Session lock (this.soundLock) { if (this.soundOut == null) @@ -63,7 +61,6 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) } } - // 2. Prepare Language & Tokens string langCode = nextItem.Language switch { ClientLanguage.Japanese => "ja", @@ -75,12 +72,10 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) int[] tokens = Tokenizer.Tokenize(nextItem.Text, langCode, preprocess: true); var segments = SegmentationSystem.SplitToSegments(tokens, new() { MaxFirstSegmentLength = 200 }); - // 3. Inference & Playback Loop foreach (var chunk in segments) { if (nextItem.Aborted) break; - // CPU Inference var samples = model.Infer(chunk, nextItem.Voice.Features, nextItem.Speed); byte[] bytes = KokoroPlayback.GetBytes(samples); @@ -97,7 +92,7 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> Play): {Ms}", elapsed.TotalMilliseconds); } this.soundOut.Play(); } @@ -114,10 +109,8 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) protected override void OnSoundCancelled() { - // 1. Flag the current item to stop the inference loop GetCurrentItem()?.Cancel(); - // 2. 
Hard Stop the WASAPI hardware session immediately StopHardware(); } @@ -163,7 +156,6 @@ protected override void Dispose(bool disposing) public void EnqueueSound(KokoroSourceQueueItem item) { - // Add the item to the internal SoundQueue processing loop this.AddQueueItem(item); } } diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs index 84ea7c62..3bf6d574 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs @@ -69,27 +69,22 @@ public override void Say(SayRequest request) public override void CancelAllSpeech() { - //Cancel at the queue this.uiModel.SoundQueue.CancelAllSounds(); - //Cancel at Speech Generation if (uiModel.OpenAi._ttsCts != null) { uiModel.OpenAi._ttsCts.Cancel(); uiModel.OpenAi._ttsCts.Dispose(); uiModel.OpenAi._ttsCts = null; } - //Cancel at Playback this.uiModel.SoundQueue.StopHardware(); } public override void CancelSay(TextSource source) { - //Cancel at the queue this.uiModel.SoundQueue.CancelFromSource(source); - //Cancel at Speech Generation if (uiModel.OpenAi._ttsCts != null) { uiModel.OpenAi._ttsCts.Cancel(); @@ -97,7 +92,6 @@ public override void CancelSay(TextSource source) uiModel.OpenAi._ttsCts = null; } - //Cancel at Playback if (uiModel.SoundQueue._ttsCts != null) { uiModel.OpenAi._ttsCts.Cancel(); diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs index 6f41588c..64841864 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs @@ -122,33 +122,25 @@ public void DrawVoicePresetOptions() if (currentVoicePreset.Model == null) return; var currentModel = OpenAiClient.Models.First(x => x.ModelName == currentVoicePreset.Model); - // 1. Determine what to display in the preview (the value corresponding to the current key) if (!currentModel.Voices.TryGetValue(currentVoicePreset.VoiceName ?? "", out var currentPreviewName)) { - // Fallback if current key is invalid or null currentVoicePreset.VoiceName = currentModel.Voices.Keys.First(); currentPreviewName = currentModel.Voices[currentVoicePreset.VoiceName]; config.Save(); } - // 2. Start the Combo Box with the Descriptive Value as the preview if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", currentPreviewName)) { foreach (var voice in currentModel.Voices) { - // voice.Key is "alloy", "ash", etc. - // voice.Value is "Alloy (Neutral & Balanced)", etc. bool isSelected = (currentVoicePreset.VoiceName == voice.Key); - // 3. Display the descriptive Value to the user if (ImGui.Selectable(voice.Value, isSelected)) { - // 4. Update config with the underlying Key currentVoicePreset.VoiceName = voice.Key; config.Save(); } - // Standard ImGui accessibility: set focus to the selected item if (isSelected) { ImGui.SetItemDefaultFocus(); @@ -189,17 +181,14 @@ public void DrawVoicePresetOptions() } else { - // 1. Generate the preview text directly from the set string previewText = currentVoicePreset.Styles.Count > 0 ? string.Join(", ", currentVoicePreset.Styles) : "None selected"; - // 2. 
Open the Combo if (ImGui.BeginCombo($"Voice Style##{MemoizedId.Create()}", previewText)) { foreach (var styleName in config.CustomVoiceStyles) { - // Check if this style is currently in our preset's set bool isSelected = currentVoicePreset.Styles.Contains(styleName); if (ImGui.Selectable(styleName, isSelected, ImGuiSelectableFlags.DontClosePopups)) @@ -209,9 +198,6 @@ public void DrawVoicePresetOptions() else currentVoicePreset.Styles.Add(styleName); - // 3. Save immediately - // Because 'Styles' is a reference type inside the preset, - // the save/reload won't "wipe" your local UI state anymore. currentVoicePreset.SyncStringFromSet(); this.config.Save(); } diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs index c11b763a..1d510768 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs @@ -27,7 +27,6 @@ public class OpenAiClient private readonly HttpClient _httpClient = new(); - // --- Provided Definitions --- public record ModelConfig( string ModelName, IReadOnlyDictionary Voices, @@ -60,7 +59,6 @@ public record ModelConfig( public string? ApiKey { get; set; } - // --- Implementation --- public OpenAiClient(StreamingSoundQueue soundQueue, string apiKey) { _soundQueue = soundQueue; @@ -83,7 +81,6 @@ public async Task Say(string text, string modelName, TextSource source, string v try { - // 1. Prepare the JSON Payload var requestBody = new Dictionary { { "model", modelName }, @@ -100,20 +97,15 @@ public async Task Say(string text, string modelName, TextSource source, string v requestBody["instructions"] = instructions; } - // 2. Configure the Request using var request = new HttpRequestMessage(HttpMethod.Post, "https://api.openai.com/v1/audio/speech"); request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", ApiKey); request.Content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json"); - // 3. Send and Stream Response - // HttpCompletionOption.ResponseHeadersRead is the "magic" for low latency var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); response.EnsureSuccessStatusCode(); var responseStream = await response.Content.ReadAsStreamAsync(token); - // 4. Pass the live stream directly to the sound queue - // The queue will handle the background reading/decoding _soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Wave, null, methodStart); } catch (OperationCanceledException) diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs index 998b8c99..66044f39 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackend.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -44,7 +44,6 @@ public PiperBackend(PluginConfiguration config) this.piper = new PiperProvider(new PiperConfiguration() { ExecutableLocation = piperExe, - // 2. 
Set the working directory to the folder containing the .exe WorkingDirectory = piperBaseDir }); @@ -74,10 +73,8 @@ public async Task DownloadSpecificModel(string modelKey, VoiceModel entry) try { DetailedLog.Info($"Downloading voice: {modelKey}"); - // Downloads into voicesDir/modelKey/ await entry.DownloadModel(voicesDir); - // Prepare the model.json for PiperSharp string onnxPath = Path.Combine(modelTargetDir, $"{modelKey}.onnx"); await LoadSpecificVoiceModel(onnxPath); @@ -101,7 +98,6 @@ public bool DeleteVoiceModel(string modelKey) if (Directory.Exists(modelTargetDir)) { - // Delete the folder and all contents (.onnx, .json) Directory.Delete(modelTargetDir, true); DetailedLog.Info($"Deleted voice model: {modelKey}"); return true; @@ -120,10 +116,8 @@ public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) string configDir = config.GetPluginConfigDirectory(); string voicesDir = GetVoicesDir(config); - // Ensure the executable is downloaded first await EnsureExecutableDownloaded(config); - // Fetch all available models from Hugging Face var allModels = await PiperDownloader.GetHuggingFaceModelList(); // TARGET: Only download "en_US-lessac-medium" initially @@ -140,12 +134,10 @@ public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) { DetailedLog.Info($"Downloading starter English voice: {starterModelKey}"); - // Downloads the model to voicesDir/en_US-lessac-medium/ await modelEntry.DownloadModel(voicesDir); string onnxPath = Path.Combine(modelTargetDir, $"{starterModelKey}.onnx"); - // Initialize the model config and directory await LoadSpecificVoiceModel(onnxPath); } catch (Exception ex) diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs index e4e55ebe..85a66043 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -30,9 +30,9 @@ public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBacken public class PiperModelInfo { public string FullPath { get; set; } - public string DisplayName { get; set; } // The dataset name (e.g., "Lessac") + public string DisplayName { get; set; } public string Quality { get; set; } - public string LanguageName { get; set; } // The pretty name (e.g., "English - US") + public string LanguageName { get; set; } public static PiperModelInfo FromPath(string onnxPath) { @@ -48,7 +48,6 @@ public static PiperModelInfo FromPath(string onnxPath) var langCode = root.GetProperty("language").GetProperty("code").GetString(); var langPlain = root.GetProperty("language").GetProperty("name_english").GetString(); - // Pass both code and plain name to your disambiguation helper var prettyLang = GetPrettyLanguageName(langCode, langPlain); var dataset = root.GetProperty("dataset").GetString(); @@ -81,7 +80,7 @@ public static string GetPrettyLanguageName(string code, string fallbackName) "es_mx" => "Spanish - MX", "nl_be" => "Dutch - BE", "nl_nl" => "Dutch - NL", - _ => fallbackName // Use "English", "French", etc. from JSON + _ => fallbackName }; } @@ -162,7 +161,6 @@ public void DrawVoicePresetOptions() config.Save(); } - // 1. REFACTORED MODEL SCANNING (Async polling for new downloads) if (!isScanning && (DateTime.Now - lastScan).TotalSeconds > 3) { lastScan = DateTime.Now; @@ -175,17 +173,13 @@ public void DrawVoicePresetOptions() var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); if (Directory.Exists(voicesDir)) { - // 1. 
Existing Model Scanning logic var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); cachedModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); - // 2. NEW: Calculate Folder Size var dirInfo = new DirectoryInfo(voicesDir); - // Sum all files in all subdirectories long totalBytes = dirInfo.EnumerateFiles("*", SearchOption.AllDirectories).Sum(fi => fi.Length); - // Convert to MB and format - voicesFolderSize = $"{(totalBytes / 1024f / 1024f):N0} MB"; + voicesFolderSize = $"{(totalBytes / 1024f / 1024f):N0} MB"; // Display current size of Piper Voice Directory } } finally { isScanning = false; } @@ -203,14 +197,13 @@ public void DrawVoicePresetOptions() { var currentModel = allModels.FirstOrDefault(m => m.FullPath == currentVoicePreset.ModelPath); - // PREVIEW: Uses the pre-parsed properties string previewValue = currentModel != null ? $"{currentModel.LanguageName} : {currentModel.DisplayName} ({currentModel.Quality})" : "Select a model..."; if (ImGui.BeginCombo($"##ModelSelect{MemoizedId.Create()}", previewValue)) { - // Group by the LanguageName property we parsed from JSON + var languageGroups = allModels .GroupBy(m => m.LanguageName) .OrderBy(g => g.Key); @@ -225,7 +218,6 @@ public void DrawVoicePresetOptions() { bool isSelected = currentVoicePreset.ModelPath == model.FullPath; - // Content: "Lessac (medium)" string itemLabel = $"{model.LanguageName} : {model.DisplayName} ({model.Quality})"; if (ImGui.Selectable($"{itemLabel}##{model.FullPath}", isSelected)) @@ -276,14 +268,11 @@ public void DrawVoicePresetOptions() if (ImGui.Button($"Open Voice Downloader##{MemoizedId.Create()}")) { showDownloader = true; - // 2026 Best Practice: Fetch manifest on a background thread to prevent FFXIV frame drops Task.Run(async () => { try { - // Ensure this returns IDictionary or similar var models = await piperBackend.GetAvailableModels(); - // Cast or assign to your IDictionary field remoteModels = models.ToDictionary(k => k.Key, v => (VoiceModel)v.Value); } catch (Exception ex) @@ -299,7 +288,6 @@ public void DrawVoicePresetOptions() { ImGui.SetTooltip("Total disk space used by downloaded Piper voice models."); } - // Render the window if active if (showDownloader) DrawVoiceDownloader(); ImGui.Separator(); @@ -366,13 +354,11 @@ private void DrawVoiceDownloader() // Add a small delete button to the right of the "Installed" text ImGui.SameLine(ImGui.GetWindowWidth() - 40); - ImGui.PushStyleColor(ImGuiCol.Button, new global::System.Numerics.Vector4(0.6f, 0.2f, 0.2f, 1f)); // Reddish + ImGui.PushStyleColor(ImGuiCol.Button, new global::System.Numerics.Vector4(0.6f, 0.2f, 0.2f, 1f)); if (ImGui.Button("X##Delete")) { - // Trigger deletion if (piperBackend.DeleteVoiceModel(model.Key)) { - // Force a re-scan of the local files immediately so the UI updates lastScan = DateTime.MinValue; } } @@ -395,15 +381,10 @@ private void DrawVoiceDownloader() { activeDownloads.Add(model.Key); - // Use a Task.Run or Ensure the continuation happens correctly _ = piperBackend.DownloadSpecificModel(model.Key, (VoiceModel)model.Value) .ContinueWith(t => { - // 1. Force your scanner to see the new files immediately - // Assuming 'lastScan' is what triggers your cachedModels update loop lastScan = DateTime.MinValue; - - // 2. 
Remove from active downloads AFTER the scan is flagged activeDownloads.Remove(model.Key); }); } diff --git a/src/TextToTalk/Backends/Polly/PollyClient.cs b/src/TextToTalk/Backends/Polly/PollyClient.cs index eb607107..a5e4ac2d 100644 --- a/src/TextToTalk/Backends/Polly/PollyClient.cs +++ b/src/TextToTalk/Backends/Polly/PollyClient.cs @@ -85,20 +85,15 @@ public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, in bool isFirstChunk = true; try { - // Using 'using' ensures the response (and its stream) is disposed after the queue handles it var res = await this.client.SynthesizeSpeechAsync(req, ct); - // Pass the live AudioStream directly to the queue. - // Ensure EnqueueSound is updated to process the stream as it arrives. long? timestampToPass = isFirstChunk ? methodStart : null; this.soundQueue.EnqueueSound(res.AudioStream, source, volume, StreamFormat.Mp3, null, timestampToPass); isFirstChunk = false; } catch (OperationCanceledException) { - // 2026 Best Practice: Catch the cancellation exception to prevent it - // from bubbling up as a generic error. - Log.Information("TTS generation was cancelled."); + // Silently ignore cancellations } catch (Exception e) { diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs index 03099585..b9767ae7 100644 --- a/src/TextToTalk/Backends/StreamingSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -114,19 +114,22 @@ private void ProcessMp3Stream(StreamingSoundQueueItem nextItem) { lock (this.soundLock) { - - this.bufferedProvider.AddSamples(decompressedBuffer, 0, decompressedBytes); - - if (this.bufferedProvider.BufferedBytes > 4096 && - this.soundOut.PlaybackState != PlaybackState.Playing) + if (this.bufferedProvider != null && this.soundOut != null) { - if (nextItem.StartTime.HasValue) + this.bufferedProvider.AddSamples(decompressedBuffer, 0, decompressedBytes); + if (this.bufferedProvider.BufferedBytes > 4096 && + this.soundOut.PlaybackState != PlaybackState.Playing) { - var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + Log.Information("Total Latency (Say -> PlayMp3): {Ms}", elapsed.TotalMilliseconds); + } + this.soundOut.Play(); } - this.soundOut.Play(); } + + } } } @@ -173,7 +176,7 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> Processing): {Ms}ms", elapsed.TotalMilliseconds); + Log.Information("Total Latency (Say -> PlayPCM): {Ms}", elapsed.TotalMilliseconds); } this.soundOut.Play(); diff --git a/src/TextToTalk/Backends/System/SystemSoundQueue.cs b/src/TextToTalk/Backends/System/SystemSoundQueue.cs index a72921a1..cea1bb17 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueue.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueue.cs @@ -16,13 +16,11 @@ namespace TextToTalk.Backends.System { public class SystemSoundQueue : SoundQueue { - // WASAPI Hardware Members private WasapiOut? soundOut; private BufferedWaveProvider? bufferedProvider; private VolumeSampleProvider? volumeProvider; private readonly object soundLock = new(); - // 1. 
Unified Audio Configuration private static readonly WaveFormat SystemFormat = new(22050, 16, 1); private readonly SpeechSynthesizer _speechSynthesizer; private readonly LexiconManager _lexiconManager; @@ -87,16 +85,15 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) { if (nextItem.Preset is not SystemVoicePreset preset || nextItem.Aborted) return; - // 1. Mimic Kokoro: Shared Hardware Setup lock (this.soundLock) { if (this.soundOut == null) { var mmDevice = GetWasapiDeviceFromGuid(_config.SelectedAudioDeviceGuid); - // Match the voice's expected format (SAPI default is 22050Hz, 16-bit, Mono) + this.bufferedProvider = new BufferedWaveProvider(new WaveFormat(22050, 16, 1)) { - ReadFully = true, // Prevents WASAPI from stopping on empty buffer + ReadFully = true, BufferDuration = TimeSpan.FromSeconds(30) }; this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); @@ -104,7 +101,6 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) } } - // 1.5 Instant Voice Switching via Pool if (!_synthPool.TryGetValue(preset.VoiceName, out var synth)) { synth = new SpeechSynthesizer(); @@ -115,7 +111,6 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) synth.Volume = preset.Volume; synth.Rate = preset.Rate; - // 2. Prepare Synthesis if (_speechSynthesizer.Voice.Name != preset.VoiceName) { _speechSynthesizer.SelectVoice(preset.VoiceName); @@ -125,14 +120,12 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) var ssml = _lexiconManager.MakeSsml(nextItem.Text, langCode: _speechSynthesizer.Voice.Culture.IetfLanguageTag); - // 3. Start Synthesis in Background (Feeding the buffer via bridge) + // Start Synthesis in Background (Feeding the buffer via bridge) using var bridge = new SynthesisBridgeStream(this.bufferedProvider!); _speechSynthesizer.SetOutputToWaveStream(bridge); - // Use SpeakAsync to avoid blocking the loop var synthPrompt = _speechSynthesizer.SpeakSsmlAsync(ssml); - // 4. This loop remains active as long as synthesis is running or audio is playing while (!nextItem.Aborted && (!synthPrompt.IsCompleted || this.bufferedProvider?.BufferedBytes > 44)) { if (this.bufferedProvider?.BufferedBytes > 512) // Pre-roll threshold @@ -149,8 +142,6 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) } } - // 5. Cleanup current item - this.StopHardware(); } @@ -158,7 +149,7 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) private class SynthesisBridgeStream : Stream { private readonly BufferedWaveProvider _target; - private int _bytesToSkip = 0; + private int _bytesToSkip = 44; private bool _headerSkipped = false; private long _position = 0; @@ -194,29 +185,25 @@ public override void Flush() { } protected override void OnSoundCancelled() { - // 1. Flag the current item to stop the inference loop GetCurrentItem()?.Cancel(); _speechSynthesizer.SpeakAsyncCancelAll(); - // 2. 
Hard Stop the WASAPI hardware session immediately StopHardware(); } public override void CancelAllSounds() { - // Check if disposed before accessing the synthesizer if (_isDisposed) return; try { _speechSynthesizer?.SpeakAsyncCancelAll(); } - catch (ObjectDisposedException) { /* Already gone, safe to ignore */ } + catch (ObjectDisposedException) { } StopHardware(); - // Call base after local cancellation logic base.CancelAllSounds(); } @@ -242,28 +229,24 @@ private void StopHardware() protected override void Dispose(bool disposing) { if (_isDisposed) return; - _isDisposed = true; // Signal all loops to stop immediately + _isDisposed = true; if (disposing) { try { - // Stop hardware first to release the audio device soundOut?.Stop(); - // Abort the synthesizer BEFORE calling base.Dispose _speechSynthesizer?.SpeakAsyncCancelAll(); _speechSynthesizer?.SetOutputToNull(); - // Give the background thread a very short window to exit gracefully - // rather than joining it indefinitely. } catch (Exception ex) { DetailedLog.Error(ex, "Error during early shutdown phase"); } - base.Dispose(disposing); // Clean up the queue thread + base.Dispose(disposing); _speechSynthesizer?.Dispose(); soundOut?.Dispose(); diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs index dc306f4c..bbb97e5f 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs @@ -67,11 +67,21 @@ await this.uberduck.Say(uberduckVoicePreset.VoiceName, uberduckVoicePreset.Playb public override void CancelAllSpeech() { + if (uberduck._ttsCts != null) + { + uberduck._ttsCts.Cancel(); + } + this.soundQueue.StopHardware(); this.soundQueue.CancelAllSounds(); } public override void CancelSay(TextSource source) { + if (uberduck._ttsCts != null) + { + uberduck._ttsCts.Cancel(); + } + this.soundQueue.StopHardware(); this.soundQueue.CancelFromSource(source); } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs index 45805f84..a1cb4dce 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs @@ -52,10 +52,8 @@ public void DrawSettings(IConfigUIDelegates helpers) if (ImGui.Button($"Save and Login##{MemoizedId.Create()}")) { var apiKey = Whitespace.Replace(this.apiKey, ""); - //var password = Whitespace.Replace(this.apiSecret, ""); UberduckCredentialManager.SaveCredentials(apiKey); this.uberduck.ApiKey = apiKey; - //this.uberduck.ApiSecret = password; } ImGui.SameLine(); diff --git a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs index 9d387850..43673831 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs @@ -1,4 +1,5 @@ -using Dalamud.Interface.Windowing; +using Dalamud.Game.ClientState.Fates; +using Dalamud.Interface.Windowing; using Newtonsoft.Json; using Serilog; using System; @@ -10,6 +11,7 @@ using System.Net.Http; using System.Text; using System.Text.RegularExpressions; +using System.Threading; using System.Threading.Tasks; using WindowsSystem = System.Net; @@ -21,6 +23,7 @@ public partial class UberduckClient private readonly HttpClient http; private readonly StreamingSoundQueue soundQueue; + public CancellationTokenSource? _ttsCts; public string? ApiKey { private get; set; } public string? 
ApiSecret { private get; set; } @@ -44,45 +47,46 @@ public UberduckClient(StreamingSoundQueue soundQueue, HttpClient http) Voices = new List(); } + // Uberduck TTS API call. They have moved to a versioned API so this section needed a pretty extensive re-work public async Task Say(string voice, int playbackRate, float volume, TextSource source, string text) { + long methodStart = Stopwatch.GetTimestamp(); + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; var url = "https://api.uberduck.ai/v1/text-to-speech"; var payload = new { text = text, voice = voice, - output_format = "wav" // CRITICAL: Forces the API to return a WAV file instead of MP3 + output_format = "wav" }; using var request = new HttpRequestMessage(HttpMethod.Post, url); request.Headers.Authorization = new WindowsSystem.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey); request.Content = new StringContent(JsonConvert.SerializeObject(payload), Encoding.UTF8, "application/json"); - var response = await this.http.SendAsync(request); + var response = await this.http.SendAsync(request, token); if (response.IsSuccessStatusCode) { - var json = await response.Content.ReadAsStringAsync(); + var json = await response.Content.ReadAsStringAsync(token); var result = JsonConvert.DeserializeObject(json); - if (result?.AudioUrl != null) + if (result?.AudioUrl != null && !token.IsCancellationRequested) { - // Download the actual audio data + var audioBytes = await this.http.GetByteArrayAsync(result.AudioUrl); - // Use a MemoryStream to hold the downloaded data var waveStream = new MemoryStream(audioBytes); long? timestampToPass = methodStart; - // Pass the stream to your queue. Ensure the consumer uses WaveFileReader - // to correctly handle the WAV container. + this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Uberduck, null, timestampToPass); } } } - - // 2026 Response Model public class UberduckTtsResponse { [JsonProperty("audio_url")] @@ -109,7 +113,6 @@ private async Task GetUuidForVoice(string voice) return voiceInfo.VoiceModelUuid; } - // 1. Change return type to Task>> public async Task>> UpdateVoices() { Log.Information("Updating Voices..."); @@ -125,10 +128,8 @@ public async Task>> UpdateVoices() var json = await response.Content.ReadAsStringAsync(); var result = JsonConvert.DeserializeObject(json); - // Update local state if needed this.Voices = result?.Voices ?? new List(); - // Update the cached variable before returning this.CachedVoices = this.Voices .OrderBy(v => v.DisplayName) .GroupBy(v => v.Category ?? 
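The queue-side consumer for StreamFormat.Uberduck is not shown in this excerpt; because the item carries a complete RIFF/WAV container rather than raw PCM, it presumably gets unwrapped with NAudio before reaching the shared provider. A rough sketch, borrowing EnsureHardwareInitialized and nextItem.Data from the streaming queue elsewhere in this patch (the actual branch may look different):

// Sketch only: read the WAV container and feed its PCM to the shared WASAPI provider.
using var reader = new WaveFileReader(nextItem.Data);
EnsureHardwareInitialized(reader.WaveFormat);
var buffer = new byte[reader.WaveFormat.AverageBytesPerSecond / 4];
int read;
while ((read = reader.Read(buffer, 0, buffer.Length)) > 0)
{
    this.bufferedProvider?.AddSamples(buffer, 0, read);
}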
"Uncategorized") @@ -144,7 +145,6 @@ public async Task>> UpdateVoices() } } - // Return empty dictionary if authorization fails or request fails return new Dictionary>(); } @@ -199,8 +199,6 @@ private static string GetRequestFailureDetail(string resContent) private void AddAuthorization(HttpRequestMessage req) { - // 2026 standard uses Bearer token with the API Key - // Ensure your ApiKey is the "Public Key" or "API Key" from the Uberduck dashboard req.Headers.Authorization = new global::System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey); } From bcd063b549c2f7bd732f79cdb6bdf95ae7911758 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 21 Jan 2026 14:25:57 -0500 Subject: [PATCH 10/13] Added "/tttstats" so user can track average latency for TTS --- src/TextToTalk/Backends/Azure/AzureBackend.cs | 9 ++- .../Backends/Azure/AzureBackendUIModel.cs | 7 ++- src/TextToTalk/Backends/Azure/AzureClient.cs | 4 +- .../Backends/ElevenLabs/ElevenLabsBackend.cs | 5 +- .../ElevenLabs/ElevenLabsBackendUIModel.cs | 4 +- .../GoogleCloud/GoogleCloudBackend.cs | 5 +- .../Backends/Kokoro/KokoroBackend.cs | 6 +- .../Backends/Kokoro/KokoroSoundQueue.cs | 5 +- .../Backends/OpenAI/OpenAiBackend.cs | 5 +- .../Backends/OpenAI/OpenAiBackendUIModel.cs | 4 +- src/TextToTalk/Backends/Piper/PiperBackend.cs | 6 +- src/TextToTalk/Backends/Polly/PollyBackend.cs | 7 ++- .../Backends/Polly/PollyBackendUI.cs | 8 ++- .../Backends/Polly/PollyBackendUIModel.cs | 12 ++-- src/TextToTalk/Backends/Polly/PollyClient.cs | 4 +- .../Backends/StreamingSoundQueue.cs | 12 +++- .../Backends/System/SystemBackend.cs | 6 +- .../Backends/System/SystemSoundQueue.cs | 8 ++- .../Backends/Uberduck/UberduckBackend.cs | 6 +- .../Backends/VoiceBackendManager.cs | 25 +++++---- .../Backends/Websocket/WebsocketBackend.cs | 5 +- .../CommandModules/MainCommandModule.cs | 11 +++- src/TextToTalk/TextToTalk.cs | 55 ++++++++++++++++++- src/TextToTalk/UI/Windows/StatsWindow.cs | 51 +++++++++++++++++ 24 files changed, 207 insertions(+), 63 deletions(-) create mode 100644 src/TextToTalk/UI/Windows/StatsWindow.cs diff --git a/src/TextToTalk/Backends/Azure/AzureBackend.cs b/src/TextToTalk/Backends/Azure/AzureBackend.cs index a6b7c266..48bb182e 100644 --- a/src/TextToTalk/Backends/Azure/AzureBackend.cs +++ b/src/TextToTalk/Backends/Azure/AzureBackend.cs @@ -1,9 +1,7 @@ using Dalamud.Bindings.ImGui; -using FFXIVClientStructs.FFXIV.Client.Game.UI; using System; using System.Collections.Generic; using System.Net.Http; -using TextToTalk.Backends.ElevenLabs; using static TextToTalk.Backends.Azure.AzureClient; namespace TextToTalk.Backends.Azure; @@ -13,18 +11,19 @@ public class AzureBackend : VoiceBackend private readonly AzureBackendUI ui; private readonly AzureBackendUIModel uiModel; public List voices; + private readonly LatencyTracker latencyTracker; - public AzureBackend(PluginConfiguration config, HttpClient http) + public AzureBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFFF96800); var lexiconManager = new DalamudLexiconManager(); LexiconUtils.LoadFromConfigAzure(lexiconManager, config); - this.uiModel = new AzureBackendUIModel(config, lexiconManager); + this.uiModel = new AzureBackendUIModel(config, lexiconManager, latencyTracker); this.voices = this.uiModel.voices; this.ui = new AzureBackendUI(this.uiModel, config, lexiconManager, http, this); - + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git 
a/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs b/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs index c86d14a4..c219b830 100644 --- a/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs +++ b/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs @@ -16,6 +16,8 @@ public class AzureBackendUIModel public List voices; private AzureLoginInfo loginInfo; + private readonly LatencyTracker latencyTracker; + /// /// Gets the currently-instantiated Azure client instance. /// @@ -31,13 +33,14 @@ public class AzureBackendUIModel /// public IReadOnlyList Voices => this.voices; - public AzureBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager) + public AzureBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager, LatencyTracker latencyTracker) { this.config = config; this.lexiconManager = lexiconManager; this.voices = new List(); this.loginInfo = new AzureLoginInfo(); + this.latencyTracker = latencyTracker; var credentials = AzureCredentialManager.LoadCredentials(); if (credentials != null) { @@ -97,7 +100,7 @@ private bool TryAzureLogin() try { DetailedLog.Info($"Logging into Azure region {this.loginInfo.Region}"); - Azure = new AzureClient(this.loginInfo.SubscriptionKey, this.loginInfo.Region, this.lexiconManager, this.config); + Azure = new AzureClient(this.loginInfo.SubscriptionKey, this.loginInfo.Region, this.lexiconManager, this.config, this.latencyTracker); // This should throw an exception if the login failed this.voices = Azure.GetVoicesWithStyles(); return true; diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index 16c49827..be1df8e3 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -27,7 +27,7 @@ public class AzureClient : IDisposable private readonly PluginConfiguration config; private CancellationTokenSource? 
_ttsCts; - public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config) + public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { _apiKey = subscriptionKey; _endpoint = $"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"; @@ -36,7 +36,7 @@ public AzureClient(string subscriptionKey, string region, LexiconManager lexicon _httpClient.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", _apiKey); _httpClient.DefaultRequestHeaders.Add("User-Agent", "TextToTalkApp"); - soundQueue = new StreamingSoundQueue(config); + soundQueue = new StreamingSoundQueue(config, latencyTracker); _lexiconManager = lexiconManager; speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm); diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs index ef00d2f4..ae953d07 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs @@ -13,10 +13,11 @@ public class ElevenLabsBackend : VoiceBackend private readonly ElevenLabsBackendUIModel uiModel; private readonly INotificationService notificationService; private readonly PluginConfiguration config; + private readonly LatencyTracker latencyTracker; - public ElevenLabsBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService) + public ElevenLabsBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker) { - this.uiModel = new ElevenLabsBackendUIModel(config, http); + this.uiModel = new ElevenLabsBackendUIModel(config, http, latencyTracker); this.ui = new ElevenLabsBackendUI(uiModel, config, this); this.notificationService = notificationService; this.config = config; diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs index 38368c32..fac7bf8c 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs @@ -44,9 +44,9 @@ public class ElevenLabsBackendUIModel : IDisposable public IReadOnlyDictionary> Voices { get; private set; } public IReadOnlyDictionary Items, Dictionary? 
Rates)> Models { get; private set; } - public ElevenLabsBackendUIModel(PluginConfiguration config, HttpClient http) + public ElevenLabsBackendUIModel(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { - SoundQueue = new StreamingSoundQueue(config); + SoundQueue = new StreamingSoundQueue(config, latencyTracker); ElevenLabs = new ElevenLabsClient(SoundQueue, http); this.config = config; this.getUserSubscriptionInfoImmediately = new ReactiveProperty(0); diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs index 898ad597..e337c0b6 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs @@ -8,10 +8,11 @@ public class GoogleCloudBackend : VoiceBackend private readonly GoogleCloudClient client; private readonly StreamingSoundQueue soundQueue; private readonly GoogleCloudBackendUI ui; + private readonly LatencyTracker latencyTracker; - public GoogleCloudBackend(PluginConfiguration config) + public GoogleCloudBackend(PluginConfiguration config, LatencyTracker latencyTracker) { - soundQueue = new StreamingSoundQueue(config); + soundQueue = new StreamingSoundQueue(config, latencyTracker); client = new GoogleCloudClient(soundQueue, config.GoogleCreds); ui = new GoogleCloudBackendUI(config, client, this); } diff --git a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs index c146e18b..efe22e76 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs @@ -21,18 +21,20 @@ public class KokoroBackend : VoiceBackend private readonly KokoroBackendUI ui; private readonly Task modelTask; private readonly CancellationTokenSource cts = new(); + private readonly LatencyTracker latencyTracker; - public KokoroBackend(PluginConfiguration config) + public KokoroBackend(PluginConfiguration config, LatencyTracker latencyTracker) { ui = new KokoroBackendUI(config, this); Tokenizer.eSpeakNGPath = Path.Join(config.GetPluginAssemblyDirectory(), "espeak"); modelTask = GetModelAsync(config); - soundQueue = new KokoroSoundQueue(config, modelTask); + soundQueue = new KokoroSoundQueue(config, modelTask, latencyTracker); KokoroVoiceManager.LoadVoicesFromPath(Path.Join(config.GetPluginAssemblyDirectory(), "voices")); DetailedLog.Info($"Kokoro voices loaded: {KokoroVoiceManager.Voices.Count} voices available."); + this.latencyTracker = latencyTracker; } /// diff --git a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs index cd051bba..d0feae14 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs @@ -20,14 +20,16 @@ public class KokoroSoundQueue : SoundQueue private readonly object soundLock = new(); private readonly PluginConfiguration config; private readonly Task modelTask; + private readonly LatencyTracker latencyTracker; private WasapiOut? soundOut; private BufferedWaveProvider? bufferedProvider; - public KokoroSoundQueue(PluginConfiguration config, Task modelTask) + public KokoroSoundQueue(PluginConfiguration config, Task modelTask, LatencyTracker latencyTracker) { this.config = config; this.modelTask = modelTask; + this.latencyTracker = latencyTracker; } private bool TryGetModel([NotNullWhen(true)] out KokoroModel? 
model) @@ -92,6 +94,7 @@ protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + this.latencyTracker.AddLatency(elapsed.TotalMilliseconds); Log.Debug("Total Latency (Say -> Play): {Ms}", elapsed.TotalMilliseconds); } this.soundOut.Play(); diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs index 3bf6d574..3477cfb9 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs @@ -15,10 +15,11 @@ public class OpenAiBackend : VoiceBackend private readonly OpenAiBackendUI ui; private readonly OpenAiBackendUIModel uiModel; private readonly INotificationService notificationService; + private readonly LatencyTracker latencyTracker; - public OpenAiBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService) + public OpenAiBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker) { - this.uiModel = new OpenAiBackendUIModel(config, http); + this.uiModel = new OpenAiBackendUIModel(config, http, latencyTracker); this.ui = new OpenAiBackendUI(uiModel, config, this); this.notificationService = notificationService; } diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs index 1d8d0113..156a1b51 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs @@ -39,9 +39,9 @@ public class OpenAiBackendUIModel /// // public IReadOnlyDictionary> Voices { get; private set; } - public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http) + public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { - SoundQueue = new StreamingSoundQueue(config); + SoundQueue = new StreamingSoundQueue(config, latencyTracker); var credentials = OpenAiCredentialManager.LoadCredentials(); if (credentials != null) { diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs index 66044f39..ad5d740a 100644 --- a/src/TextToTalk/Backends/Piper/PiperBackend.cs +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -29,11 +29,12 @@ public class PiperBackend : VoiceBackend private Process? 
piperServerProcess; private readonly object processLock = new(); + private readonly LatencyTracker latencyTracker; public string GetVoicesDir(PluginConfiguration config) => Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); - public PiperBackend(PluginConfiguration config) + public PiperBackend(PluginConfiguration config, LatencyTracker latencyTracker) { this.ui = new PiperBackendUI(config, this); @@ -48,8 +49,9 @@ public PiperBackend(PluginConfiguration config) }); this.modelTask = LoadOrDownloadModelAsync(config); - this.soundQueue = new StreamingSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.config = config; + this.latencyTracker = latencyTracker; } public async Task> GetAvailableModels() diff --git a/src/TextToTalk/Backends/Polly/PollyBackend.cs b/src/TextToTalk/Backends/Polly/PollyBackend.cs index 7042ccb4..ea573ac0 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackend.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackend.cs @@ -9,17 +9,18 @@ public class PollyBackend : VoiceBackend { private readonly PollyBackendUI ui; private readonly PollyBackendUIModel uiModel; + private readonly LatencyTracker latencyTracker; - public PollyBackend(PluginConfiguration config, HttpClient http) + public PollyBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFF0099FF); var lexiconManager = new DalamudLexiconManager(); - this.uiModel = new PollyBackendUIModel(config, lexiconManager); + this.uiModel = new PollyBackendUIModel(config, lexiconManager, latencyTracker); LexiconUtils.LoadFromConfigPolly(lexiconManager, config); - this.ui = new PollyBackendUI(this.uiModel, config, lexiconManager, http, this); + this.ui = new PollyBackendUI(this.uiModel, config, lexiconManager, http, this, latencyTracker); } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/Polly/PollyBackendUI.cs b/src/TextToTalk/Backends/Polly/PollyBackendUI.cs index 8fc81505..c0918306 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackendUI.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackendUI.cs @@ -2,6 +2,7 @@ using Dalamud.Bindings.ImGui; using Dalamud.Game; using Dalamud.Game.Text; +using FFXIVClientStructs; using System; using System.IO; using System.Linq; @@ -19,12 +20,13 @@ public class PollyBackendUI private readonly LexiconComponent lexiconComponent; private readonly PollyBackendUIModel model; private readonly PollyBackend backend; + private readonly LatencyTracker latencyTracker; private string accessKey; private string secretKey; public PollyBackendUI(PollyBackendUIModel model, PluginConfiguration config, LexiconManager lexiconManager, - HttpClient http, PollyBackend backend) + HttpClient http, PollyBackend backend, LatencyTracker latencyTracker) { this.model = model; @@ -39,7 +41,9 @@ public PollyBackendUI(PollyBackendUIModel model, PluginConfiguration config, Lex new LexiconComponent(lexiconManager, lexiconRepository, config, () => config.PollyLexiconFiles); (this.accessKey, this.secretKey) = this.model.GetKeyPair(); + this.latencyTracker = latencyTracker; } + public void DrawSettings(IConfigUIDelegates helpers) { @@ -58,7 +62,7 @@ public void DrawSettings(IConfigUIDelegates helpers) if (ImGui.Button($"Save and Login##{MemoizedId.Create()}")) { - this.model.LoginWith(this.accessKey, this.secretKey); + this.model.LoginWith(this.accessKey, this.secretKey, this.latencyTracker); } ImGui.SameLine(); diff --git 
a/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs b/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs index 9bef8aa4..52135238 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs @@ -46,7 +46,7 @@ public class PollyBackendUIModel : IDisposable /// public string[] Engines { get; } = { Engine.Neural, Engine.Standard, Engine.Generative, Engine.LongForm }; - public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager) + public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager, LatencyTracker latencyTracker) { this.config = config; this.lexiconManager = lexiconManager; @@ -59,7 +59,7 @@ public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconMan this.keyPair.AccessKey = credentials.UserName; this.keyPair.SecretKey = credentials.Password; - TryPollyLogin(GetCurrentRegion()); + TryPollyLogin(GetCurrentRegion(), latencyTracker); } } @@ -75,13 +75,13 @@ public PollyKeyPair GetKeyPair() /// /// The client's access key. /// The client's secret access key. - public void LoginWith(string accessKey, string secretKey) + public void LoginWith(string accessKey, string secretKey, LatencyTracker latencyTracker) { var username = Whitespace.Replace(accessKey, ""); var password = Whitespace.Replace(secretKey, ""); this.keyPair = new PollyKeyPair { AccessKey = username, SecretKey = password }; - if (TryPollyLogin(GetCurrentRegion())) + if (TryPollyLogin(GetCurrentRegion(), latencyTracker)) { // Only save the user's new credentials if the login succeeded PollyCredentialManager.SaveCredentials(username, password); @@ -173,7 +173,7 @@ public void SetCurrentEngine(Engine engine) this.config.Save(); } - private bool TryPollyLogin(RegionEndpoint regionEndpoint) + private bool TryPollyLogin(RegionEndpoint regionEndpoint, LatencyTracker latencyTracker) { PollyLoginException = null; Polly?.Dispose(); @@ -181,7 +181,7 @@ private bool TryPollyLogin(RegionEndpoint regionEndpoint) { DetailedLog.Info($"Logging into AWS region {regionEndpoint}"); Polly = new PollyClient(this.keyPair.AccessKey, this.keyPair.SecretKey, regionEndpoint, - this.lexiconManager, this.config); + this.lexiconManager, this.config, latencyTracker); var currentVoicePreset = this.config.GetCurrentVoicePreset(); // This should throw an exception if the login credentials were incorrect this.voices = Polly.GetVoicesForEngine(currentVoicePreset?.VoiceEngine ?? Engine.Neural); diff --git a/src/TextToTalk/Backends/Polly/PollyClient.cs b/src/TextToTalk/Backends/Polly/PollyClient.cs index a5e4ac2d..421d98c2 100644 --- a/src/TextToTalk/Backends/Polly/PollyClient.cs +++ b/src/TextToTalk/Backends/Polly/PollyClient.cs @@ -22,11 +22,11 @@ public class PollyClient : IDisposable public CancellationTokenSource? 
_TtsCts; - public PollyClient(string accessKey, string secretKey, RegionEndpoint region, LexiconManager lexiconManager, PluginConfiguration config) + public PollyClient(string accessKey, string secretKey, RegionEndpoint region, LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { var credentials = new BasicAWSCredentials(accessKey, secretKey); this.client = new AmazonPollyClient(credentials, region); - this.soundQueue = new StreamingSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.lexiconManager = lexiconManager; } diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs index b9767ae7..69716af7 100644 --- a/src/TextToTalk/Backends/StreamingSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -7,6 +7,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Linq; using System.Net.Http; using System.Net.Sockets; using System.Speech.Synthesis; @@ -19,7 +20,7 @@ namespace TextToTalk.Backends { - public class StreamingSoundQueue(PluginConfiguration config) : SoundQueue + public class StreamingSoundQueue(PluginConfiguration config, LatencyTracker latencyTracker) : SoundQueue { // WASAPI Hardware Members private WasapiOut? soundOut; @@ -36,6 +37,8 @@ public class StreamingSoundQueue(PluginConfiguration config) : SoundQueue PlayMp3): {Ms}", elapsed.TotalMilliseconds); + latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> PlayMp3): {Ms}", elapsed.TotalMilliseconds); } this.soundOut.Play(); } @@ -176,7 +180,8 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> PlayPCM): {Ms}", elapsed.TotalMilliseconds); + latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> PlayPCM): {Ms}", elapsed.TotalMilliseconds); } this.soundOut.Play(); @@ -187,6 +192,7 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) nextItem.Data.Dispose(); } + private void EnsureHardwareInitialized(WaveFormat format) { if (this.soundOut == null || !this.bufferedProvider.WaveFormat.Equals(format)) diff --git a/src/TextToTalk/Backends/System/SystemBackend.cs b/src/TextToTalk/Backends/System/SystemBackend.cs index 0df3f02c..4e7244c1 100644 --- a/src/TextToTalk/Backends/System/SystemBackend.cs +++ b/src/TextToTalk/Backends/System/SystemBackend.cs @@ -12,9 +12,10 @@ public class SystemBackend : VoiceBackend private readonly SystemBackendUI ui; private readonly SystemSoundQueue soundQueue; private readonly IDisposable voiceExceptions; + private readonly LatencyTracker latencyTracker; - public SystemBackend(PluginConfiguration config, HttpClient http) + public SystemBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { var lexiconManager = new DalamudLexiconManager(); LexiconUtils.LoadFromConfigSystem(lexiconManager, config); @@ -22,8 +23,9 @@ public SystemBackend(PluginConfiguration config, HttpClient http) this.uiModel = new SystemBackendUIModel(); this.ui = new SystemBackendUI(this.uiModel, config, lexiconManager, http, this); - this.soundQueue = new SystemSoundQueue(lexiconManager, config); + this.soundQueue = new SystemSoundQueue(lexiconManager, config, latencyTracker); this.voiceExceptions = this.uiModel.SubscribeToVoiceExceptions(this.soundQueue.SelectVoiceFailed); + 
this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/System/SystemSoundQueue.cs b/src/TextToTalk/Backends/System/SystemSoundQueue.cs index cea1bb17..092fea96 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueue.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueue.cs @@ -25,6 +25,7 @@ public class SystemSoundQueue : SoundQueue private readonly SpeechSynthesizer _speechSynthesizer; private readonly LexiconManager _lexiconManager; private readonly PluginConfiguration _config; + private readonly LatencyTracker _latencyTracker; private readonly Subject _selectVoiceFailed = new(); @@ -55,12 +56,12 @@ public void EnqueueSound(VoicePreset preset, TextSource source, string text, lon }); } - public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config) + public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { _lexiconManager = lexiconManager; _config = config; _speechSynthesizer = new SpeechSynthesizer(); - + _latencyTracker = latencyTracker; } @@ -135,7 +136,8 @@ protected override void OnSoundLoop(SystemSoundQueueItem nextItem) if (nextItem.StartTime.HasValue) { var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); - Log.Information("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + _latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); } this.soundOut?.Play(); } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs index bbb97e5f..7aa097d9 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs @@ -14,16 +14,18 @@ public class UberduckBackend : VoiceBackend private readonly StreamingSoundQueue soundQueue; private readonly UberduckBackendUI ui; private readonly UberduckClient? uberduck; + private readonly LatencyTracker latencyTracker; - public UberduckBackend(PluginConfiguration config, HttpClient http) + public UberduckBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFFDE7312); - this.soundQueue = new StreamingSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.uberduck = new UberduckClient(this.soundQueue, http); var voices = this.uberduck.UpdateVoices().GetAwaiter().GetResult(); this.ui = new UberduckBackendUI(config, this.uberduck, () => voices, this); + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/VoiceBackendManager.cs b/src/TextToTalk/Backends/VoiceBackendManager.cs index 5c17c571..e115e5c5 100644 --- a/src/TextToTalk/Backends/VoiceBackendManager.cs +++ b/src/TextToTalk/Backends/VoiceBackendManager.cs @@ -24,17 +24,20 @@ public class VoiceBackendManager : VoiceBackend private readonly PluginConfiguration config; private readonly IUiBuilder uiBuilder; private readonly INotificationService notificationService; + private readonly LatencyTracker latencyTracker; public VoiceBackend? 
Backend { get; private set; } public bool BackendLoading { get; private set; } public VoiceBackendManager(PluginConfiguration config, HttpClient http, IUiBuilder uiBuilder, - INotificationService notificationService) + INotificationService notificationService, LatencyTracker tracker) { this.config = config; this.http = http; this.uiBuilder = uiBuilder; this.notificationService = notificationService; + this.latencyTracker = tracker; + SetBackend(this.config.Backend); } @@ -105,16 +108,16 @@ private VoiceBackend CreateBackendFor(TTSBackend backendKind) { return backendKind switch { - TTSBackend.System => new SystemBackend(this.config, this.http), - TTSBackend.Websocket => new WebsocketBackend(this.config, this.notificationService), - TTSBackend.AmazonPolly => new PollyBackend(this.config, this.http), - TTSBackend.Uberduck => new UberduckBackend(this.config, this.http), - TTSBackend.Azure => new AzureBackend(this.config, this.http), - TTSBackend.ElevenLabs => new ElevenLabsBackend(this.config, this.http, this.notificationService), - TTSBackend.OpenAi => new OpenAiBackend(this.config, this.http, this.notificationService), - TTSBackend.GoogleCloud => new GoogleCloudBackend(this.config), - TTSBackend.Kokoro => new KokoroBackend(this.config), - TTSBackend.Piper => new PiperBackend(this.config), + TTSBackend.System => new SystemBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Websocket => new WebsocketBackend(this.config, this.notificationService, this.latencyTracker), + TTSBackend.AmazonPolly => new PollyBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Uberduck => new UberduckBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Azure => new AzureBackend(this.config, this.http, this.latencyTracker), + TTSBackend.ElevenLabs => new ElevenLabsBackend(this.config, this.http, this.notificationService, this.latencyTracker), + TTSBackend.OpenAi => new OpenAiBackend(this.config, this.http, this.notificationService, this.latencyTracker), + TTSBackend.GoogleCloud => new GoogleCloudBackend(this.config, this.latencyTracker), + TTSBackend.Kokoro => new KokoroBackend(this.config, this.latencyTracker), + TTSBackend.Piper => new PiperBackend(this.config, this.latencyTracker), _ => throw new ArgumentOutOfRangeException(nameof(backendKind)), }; } diff --git a/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs b/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs index 058e01a4..e9be0d60 100644 --- a/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs +++ b/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs @@ -23,7 +23,9 @@ public class WebsocketBackend : VoiceBackend private bool dirtyConfig; private Exception? 
lastException; - public WebsocketBackend(PluginConfiguration config, INotificationService notificationService) + private readonly LatencyTracker latencyTracker; + + public WebsocketBackend(PluginConfiguration config, INotificationService notificationService, LatencyTracker latencyTracker) { this.config = config; @@ -40,6 +42,7 @@ public WebsocketBackend(PluginConfiguration config, INotificationService notific } this.wsServer.Start(); + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/CommandModules/MainCommandModule.cs b/src/TextToTalk/CommandModules/MainCommandModule.cs index dd516c95..91264bd9 100644 --- a/src/TextToTalk/CommandModules/MainCommandModule.cs +++ b/src/TextToTalk/CommandModules/MainCommandModule.cs @@ -14,9 +14,10 @@ public class MainCommandModule : CommandModule private readonly ConfigurationWindow configurationWindow; private readonly IConfigUIDelegates configUIDelegates; private readonly VoiceStyles StylesWindow; + private readonly StatsWindow StatsWindow; public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginConfiguration config, - VoiceBackendManager backendManager, ConfigurationWindow configurationWindow, IConfigUIDelegates configUIDelegates, VoiceStyles StylesWindow) : base(commandManager) //ElevenLabsStylesWindow elevenLabsStylesWindow) + VoiceBackendManager backendManager, ConfigurationWindow configurationWindow, IConfigUIDelegates configUIDelegates, VoiceStyles StylesWindow, StatsWindow statsWindow) : base(commandManager) //ElevenLabsStylesWindow elevenLabsStylesWindow) { this.chat = chat; @@ -25,6 +26,7 @@ public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginCo this.configurationWindow = configurationWindow; this.configUIDelegates = configUIDelegates; this.StylesWindow = StylesWindow; + this.StatsWindow = statsWindow; AddCommand("/canceltts", CancelTts, "Cancel all queued TTS messages."); AddCommand("/toggletts", ToggleTts, "Toggle TextToTalk's text-to-speech."); @@ -32,6 +34,8 @@ public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginCo AddCommand("/enabletts", EnableTts, "Enable TextToTalk's text-to-speech."); AddCommand("/tttconfig", ToggleConfig, "Toggle TextToTalk's configuration window."); AddCommand("/tttstyles", ToggleStyles, "Toggle TextToTalk's styles window."); + AddCommand("/tttstats", ToggleStats, "Toggle TextToTalk's latency stats window."); + } public void CancelTts(string command = "", string args = "") @@ -70,4 +74,9 @@ public void ToggleStyles(string command = "", string args = "") { this.StylesWindow.Toggle(); } + + public void ToggleStats(string command = "", string args = "") + { + this.StatsWindow.Toggle(); + } } \ No newline at end of file diff --git a/src/TextToTalk/TextToTalk.cs b/src/TextToTalk/TextToTalk.cs index 5e06fa40..657429d0 100644 --- a/src/TextToTalk/TextToTalk.cs +++ b/src/TextToTalk/TextToTalk.cs @@ -92,6 +92,9 @@ public partial class TextToTalk : IDalamudPlugin private readonly IConfigUIDelegates configUIDelegates; private readonly VoiceStyles StylesWindow; + private readonly LatencyTracker tracker; + private readonly StatsWindow statsWindow; + public string Name => "TextToTalk"; @@ -120,6 +123,10 @@ public TextToTalk( this.framework = framework; this.data = data; + this.tracker = new LatencyTracker(); + this.statsWindow = new StatsWindow(this.tracker); + + CreateDatabasePath(); CreateEventLogDatabase(); this.database = new LiteDatabase(GetDatabasePath("TextToTalk.db")); @@ 
-141,7 +148,7 @@ public TextToTalk( var sharedState = new SharedState(); this.http = new HttpClient(); - this.backendManager = new VoiceBackendManager(this.config, this.http, pi.UiBuilder, this.notificationService); + this.backendManager = new VoiceBackendManager(this.config, this.http, pi.UiBuilder, this.notificationService, this.tracker); this.StylesWindow = new VoiceStyles(this.backendManager, this.configUIDelegates, this.config); this.playerService = new PlayerService(playerCollection, this.config.GetVoiceConfig().VoicePresets); this.npcService = new NpcService(npcCollection, this.config.GetVoiceConfig().VoicePresets); @@ -169,6 +176,7 @@ public TextToTalk( this.windows.AddWindow(this.configurationWindow); this.windows.AddWindow(channelPresetModificationWindow); this.windows.AddWindow(this.StylesWindow); + this.windows.AddWindow(this.statsWindow); var filters = new MessageHandlerFilters(sharedState, this.config, this.clientState); this.addonTalkHandler = @@ -187,10 +195,10 @@ public TextToTalk( this.commandModule = new MainCommandModule(commandManager, chat, this.config, this.backendManager, - this.configurationWindow, this.configUIDelegates, this.StylesWindow); + this.configurationWindow, this.configUIDelegates, this.StylesWindow, this.statsWindow); this.debugCommandModule = new DebugCommandModule(commandManager, chat, gui, framework); - + RegisterCallbacks(); var handleTextCancel = HandleTextCancel(); @@ -604,4 +612,45 @@ public void Dispose() #endregion } + public class LatencyTracker + { + private readonly List history = new(); + private readonly object historyLock = new(); + + + public double AverageLatency + { + get + { + lock (historyLock) + { + return history.Count == 0 ? 0 : history.Average(); + } + } + } + public float[] GetHistoryArray() + { + lock (historyLock) + { + return history.Select(d => (float)d).ToArray(); + } + } + + public void AddLatency(double ms) + { + lock (historyLock) + { + history.Add(ms); + if (history.Count > 100) history.RemoveAt(0); // Keep last 100 requests + } + } + + public void Clear() + { + lock (historyLock) + { + history.Clear(); + } + } + } } \ No newline at end of file diff --git a/src/TextToTalk/UI/Windows/StatsWindow.cs b/src/TextToTalk/UI/Windows/StatsWindow.cs new file mode 100644 index 00000000..8ce7a920 --- /dev/null +++ b/src/TextToTalk/UI/Windows/StatsWindow.cs @@ -0,0 +1,51 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Interface.Windowing; +using System; +using System.Linq; +using TextToTalk; +using TextToTalk.Backends; + +public class StatsWindow : Window +{ + private float[] dataArray = Array.Empty(); + private DateTime lastUpdateTime = DateTime.MinValue; + private readonly object updateLock = new(); + + public interface IWindowController + { + void ToggleStats(); + } + + private readonly LatencyTracker tracker; + public bool IsVisible = false; + + public StatsWindow(TextToTalk.LatencyTracker tracker) : base("TTS Statistics") + { + this.tracker = tracker; + } + + public override void Draw() + { + float[] fullDataArray = tracker.GetHistoryArray(); + + ImGui.Text($"Average Latency: {tracker.AverageLatency:F2} ms"); + ImGui.SameLine(); + if (ImGui.Button("Clear History")) + { + tracker.Clear(); + } + + if (ImGui.TreeNode("View Raw History")) + { + if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150))) + { + for (int i = 0; i < fullDataArray.Length; i++) + { + ImGui.Text($"[{i:000}] {fullDataArray[i]:F2} ms"); + } + ImGui.EndChild(); + } + ImGui.TreePop(); + } + } +} \ No newline at end of file From 
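End to end, the tracker works like this: a backend captures a timestamp when Say() starts, the sound queue records the elapsed time the moment playback actually begins, and the stats window reads the rolling window of the last 100 samples. A condensed sketch using the members defined above (variable names are illustrative):

// In a backend's Say():
long methodStart = Stopwatch.GetTimestamp();

// In the sound queue, right before soundOut.Play():
var elapsed = Stopwatch.GetElapsedTime(methodStart);
latencyTracker.AddLatency(elapsed.TotalMilliseconds);

// In the stats window, or wherever /tttstats surfaces it:
float[] history = latencyTracker.GetHistoryArray(); // last 100 samples, oldest first
double average = latencyTracker.AverageLatency;     // 0 when no samples yet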
1bc7e78d842e49069fd2c57100f0b3c2fd6a5888 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 21 Jan 2026 14:32:46 -0500 Subject: [PATCH 11/13] Update StatsWindow.cs --- src/TextToTalk/UI/Windows/StatsWindow.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TextToTalk/UI/Windows/StatsWindow.cs b/src/TextToTalk/UI/Windows/StatsWindow.cs index 8ce7a920..2a4f3321 100644 --- a/src/TextToTalk/UI/Windows/StatsWindow.cs +++ b/src/TextToTalk/UI/Windows/StatsWindow.cs @@ -37,12 +37,15 @@ public override void Draw() if (ImGui.TreeNode("View Raw History")) { - if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150))) + // *** REFACTORED: Use the boolean 'true' for drawing a border *** + // The signature now accepts a boolean instead of ImGuiChildFlags + if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150), true)) // The 'true' draws the border { for (int i = 0; i < fullDataArray.Length; i++) { ImGui.Text($"[{i:000}] {fullDataArray[i]:F2} ms"); } + ImGui.EndChild(); } ImGui.TreePop(); From 5fd9783a8151f03e85dc5553f6064ee7463a1762 Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Wed, 21 Jan 2026 21:24:08 -0500 Subject: [PATCH 12/13] Updated Latency Tracker. Added button to Config Window --- src/TextToTalk/UI/ConfigurationWindow.cs | 19 +++++++++++++------ src/TextToTalk/UI/Windows/StatsWindow.cs | 17 +++++++++-------- src/TextToTalk/UI/Windows/StylesWindow.cs | 5 ----- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/TextToTalk/UI/ConfigurationWindow.cs b/src/TextToTalk/UI/ConfigurationWindow.cs index 64fcf5e1..7c59aa8b 100644 --- a/src/TextToTalk/UI/ConfigurationWindow.cs +++ b/src/TextToTalk/UI/ConfigurationWindow.cs @@ -1,19 +1,20 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Numerics; -using System.Text; +using Dalamud.Bindings.ImGui; using Dalamud.Game.Text; using Dalamud.Interface; using Dalamud.Interface.Windowing; using Dalamud.Plugin.Services; -using Dalamud.Bindings.ImGui; using Lumina.Excel.Sheets; using R3; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using System.Text; using TextToTalk.Backends; using TextToTalk.Data.Model; using TextToTalk.GameEnums; using TextToTalk.Services; +using TextToTalk.UI.Windows; namespace TextToTalk.UI { @@ -295,6 +296,12 @@ private void DrawSynthesizerSettings() // I'm sure there's a cleaner method to c this.config); Components.Tooltip( "Removes \"stuttering\" from NPC dialogue such as \"H-hello, nice to m-meet you...\""); + + + if (ImGui.Button($"Show Latency Data##{MemoizedId.Create()}")) + { + StatsWindow.Instance?.ToggleStats(); + } } } diff --git a/src/TextToTalk/UI/Windows/StatsWindow.cs b/src/TextToTalk/UI/Windows/StatsWindow.cs index 2a4f3321..c729204b 100644 --- a/src/TextToTalk/UI/Windows/StatsWindow.cs +++ b/src/TextToTalk/UI/Windows/StatsWindow.cs @@ -4,17 +4,14 @@ using System.Linq; using TextToTalk; using TextToTalk.Backends; +using TextToTalk.UI.Windows; public class StatsWindow : Window { private float[] dataArray = Array.Empty(); private DateTime lastUpdateTime = DateTime.MinValue; private readonly object updateLock = new(); - - public interface IWindowController - { - void ToggleStats(); - } + public static StatsWindow? 
Instance { get; private set; } private readonly LatencyTracker tracker; public bool IsVisible = false; @@ -22,6 +19,12 @@ public interface IWindowController public StatsWindow(TextToTalk.LatencyTracker tracker) : base("TTS Statistics") { this.tracker = tracker; + Instance = this; + } + + public void ToggleStats() + { + this.IsOpen = !this.IsOpen; } public override void Draw() @@ -37,9 +40,7 @@ public override void Draw() if (ImGui.TreeNode("View Raw History")) { - // *** REFACTORED: Use the boolean 'true' for drawing a border *** - // The signature now accepts a boolean instead of ImGuiChildFlags - if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150), true)) // The 'true' draws the border + if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150), true)) { for (int i = 0; i < fullDataArray.Length; i++) { diff --git a/src/TextToTalk/UI/Windows/StylesWindow.cs b/src/TextToTalk/UI/Windows/StylesWindow.cs index 2970ce8f..d00d99c5 100644 --- a/src/TextToTalk/UI/Windows/StylesWindow.cs +++ b/src/TextToTalk/UI/Windows/StylesWindow.cs @@ -29,11 +29,6 @@ public interface IVoiceStylesWindow void Draw(IConfigUIDelegates helpers); } - public interface IWindowController - { - void ToggleStyle(); - } - public class VoiceStyles : Window { private readonly VoiceBackendManager backendManager; From 15f0565477621cc8000d9ccf1087a71a6e8921ea Mon Sep 17 00:00:00 2001 From: mitcheb0219 Date: Fri, 23 Jan 2026 22:17:06 -0500 Subject: [PATCH 13/13] Updates according to notes in existing PR --- src/TextToTalk/Backends/Azure/AzureClient.cs | 3 +- .../Backends/ElevenLabs/ElevenLabsBackend.cs | 12 ++- .../GoogleCloud/GoogleCloudBackend.cs | 2 +- .../GoogleCloud/GoogleCloudBackendUI.cs | 9 ++ .../Backends/GoogleCloud/GoogleCloudClient.cs | 100 +++++++++++------- .../GoogleCloud/GoogleCloudVoicePreset.cs | 3 + .../Backends/OpenAI/OpenAiBackend.cs | 46 ++++---- .../Backends/OpenAI/OpenAiBackendUIModel.cs | 14 --- src/TextToTalk/Backends/Piper/PiperBackend.cs | 6 +- src/TextToTalk/Backends/StreamFormat.cs | 8 +- .../Backends/StreamingSoundQueue.cs | 25 ++--- .../Backends/Uberduck/UberduckBackendUI.cs | 17 ++- .../Backends/Uberduck/UberduckClient.cs | 4 +- 13 files changed, 141 insertions(+), 108 deletions(-) diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index be1df8e3..846c7178 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -108,7 +108,7 @@ public async Task Say(string? voice, int playbackRate, float volume, TextSource var chunkStream = new MemoryStream(chunk); long? 
timestampToPass = methodStart; - soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Azure, null, timestampToPass); + soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave16K, null, timestampToPass); } } @@ -150,6 +150,5 @@ public void Dispose() { this.synthesizer?.Dispose(); this.soundQueue?.Dispose(); - this.soundQueue?.Dispose(); } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs index ae953d07..63c027b3 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs @@ -9,18 +9,22 @@ namespace TextToTalk.Backends.ElevenLabs; public class ElevenLabsBackend : VoiceBackend { + private readonly ElevenLabsClient client; private readonly ElevenLabsBackendUI ui; private readonly ElevenLabsBackendUIModel uiModel; private readonly INotificationService notificationService; private readonly PluginConfiguration config; private readonly LatencyTracker latencyTracker; + private readonly StreamingSoundQueue soundQueue; public ElevenLabsBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker) { + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.uiModel = new ElevenLabsBackendUIModel(config, http, latencyTracker); this.ui = new ElevenLabsBackendUI(uiModel, config, this); this.notificationService = notificationService; this.config = config; + this.client = new ElevenLabsClient(soundQueue, http); } public override void DrawStyles(IConfigUIDelegates helpers) @@ -68,9 +72,9 @@ await this.uiModel.ElevenLabs.Say(elevenLabsVoicePreset.VoiceId, elevenLabsVoice public override void CancelAllSpeech() { this.uiModel.SoundQueue.CancelAllSounds(); - if (this.uiModel.ElevenLabs._TtsCts != null) + if (this.client._TtsCts != null) { - this.uiModel.ElevenLabs._TtsCts.Cancel(); + this.client._TtsCts.Cancel(); } this.uiModel.SoundQueue.StopHardware(); } @@ -78,9 +82,9 @@ public override void CancelAllSpeech() public override void CancelSay(TextSource source) { this.uiModel.SoundQueue.CancelFromSource(source); - if (this.uiModel.ElevenLabs._TtsCts != null) + if (this.client._TtsCts != null) { - this.uiModel.ElevenLabs._TtsCts.Cancel(); + this.client._TtsCts.Cancel(); } this.uiModel.SoundQueue.StopHardware(); } diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs index e337c0b6..178d9b16 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs @@ -26,7 +26,7 @@ public override void Say(SayRequest request) if (request.Voice is not GoogleCloudVoicePreset voicePreset) throw new InvalidOperationException("Invalid voice preset provided."); - _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, + _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.SampleRate, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, request.Text); } diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs index 38ed9a65..d6123d1a 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs @@ -101,6 +101,15 @@ public void DrawVoicePresetOptions() 
ImGui.EndCombo(); } + var validSampleRates = new[] { "8000", "16000", "22050", "24000" }; + var sampleRate = currentVoicePreset.SampleRate.ToString(); + var sampleRateIndex = Array.IndexOf(validSampleRates, sampleRate); + if (ImGui.Combo($"Sample rate##{MemoizedId.Create()}", ref sampleRateIndex, validSampleRates, + validSampleRates.Length)) + { + currentVoicePreset.SampleRate = int.Parse(validSampleRates[sampleRateIndex]); + this.config.Save(); + } var playbackRate = currentVoicePreset.PlaybackRate ?? 1; if (ImGui.SliderFloat($"Playback rate##{MemoizedId.Create()}", ref playbackRate, 0.25f, 2f, "%.2fx")) diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs index 7c99da1e..70ed4029 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs @@ -44,14 +44,11 @@ public void Init(string pathToCredential) foreach (var voice in response.Voices) { - if (voice.Name.Contains("Chirp3") || voice.Name.Contains("Chirp-HD")) // Focusing on Chirp 3 and Chirp HD voices as these are the only ones enabled for streaming. From what I can tell, this actually reduces duplicates of the same voice under different formats. + fetchedVoices.Add(voice.Name, new { - fetchedVoices.Add(voice.Name, new - { - Name = voice.Name, - Gender = voice.SsmlGender, - }); - } + Name = voice.Name, + Gender = voice.SsmlGender, + }); } return fetchedVoices; @@ -73,11 +70,15 @@ public List ExtractUniqueLocales(List? voicesList) return uniqueLocales.ToList().OrderBy(lang => lang).ToList(); } - public async Task Say(string? locale, string? voice, float? speed, float volume, TextSource source, string text) + public async Task Say(string? locale, string? voice, int? rate, float? speed, float volume, TextSource source, string text) { long methodStart = Stopwatch.GetTimestamp(); if (client == null || soundQueue == null || locale == null) return; + bool isStreamingSupported = voice != null && + (voice.Contains("Chirp3-HD", StringComparison.OrdinalIgnoreCase) || + voice.Contains("Chirp-HD", StringComparison.OrdinalIgnoreCase)); + if (_TtsCts != null) { _TtsCts?.Cancel(); @@ -86,54 +87,71 @@ public async Task Say(string? locale, string? voice, float? speed, float volume, _TtsCts = new CancellationTokenSource(); var ct = _TtsCts.Token; + + var sampleRate = rate switch + { + 24000 => StreamFormat.Wave, + 22050 => StreamFormat.Wave22K, + 16000 => StreamFormat.Wave16K, + 8000 => StreamFormat.Wave8K, + _ => StreamFormat.Wave22K + }; try { - using var streamingCall = client.StreamingSynthesize(); // One request to open the stream - - var configRequest = new StreamingSynthesizeRequest + if (isStreamingSupported) { - StreamingConfig = new StreamingSynthesizeConfig + using var streamingCall = client.StreamingSynthesize(); + + await streamingCall.WriteAsync(new StreamingSynthesizeRequest { - Voice = new VoiceSelectionParams + StreamingConfig = new StreamingSynthesizeConfig { - LanguageCode = locale, - Name = voice ?? "en-US-Chirp3-HD-Puff-A" - }, - StreamingAudioConfig = new StreamingAudioConfig + Voice = new VoiceSelectionParams { LanguageCode = locale, Name = voice }, + StreamingAudioConfig = new StreamingAudioConfig + { + AudioEncoding = AudioEncoding.Pcm, + SampleRateHertz = rate ?? 22050, + SpeakingRate = speed ?? 
1.0f, + } + } + }); + + await streamingCall.WriteAsync(new StreamingSynthesizeRequest { Input = new StreamingSynthesisInput { Text = text } }); + await streamingCall.WriteCompleteAsync(); + + await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) + { + if (response.AudioContent.Length > 0) { - AudioEncoding = AudioEncoding.Pcm, - SampleRateHertz = 24000, - SpeakingRate = speed ?? 1.0f, + var chunkStream = new MemoryStream(response.AudioContent.ToByteArray()); + soundQueue.EnqueueSound(chunkStream, source, volume, sampleRate, null, methodStart); } } - }; - await streamingCall.WriteAsync(configRequest); - - await streamingCall.WriteAsync(new StreamingSynthesizeRequest // One request to send the text and write back the chunks + } + else { - Input = new StreamingSynthesisInput { Text = text } - }); - - await streamingCall.WriteCompleteAsync(); + + var response = await client.SynthesizeSpeechAsync(new SynthesizeSpeechRequest + { + Input = new SynthesisInput { Text = text }, + Voice = new VoiceSelectionParams { LanguageCode = locale, Name = voice }, + AudioConfig = new AudioConfig + { + AudioEncoding = AudioEncoding.Linear16, + SampleRateHertz = rate ?? 22050, + SpeakingRate = speed ?? 1.0f, + } + }, ct); - await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) - { if (response.AudioContent.Length > 0) { - var chunkStream = new MemoryStream(response.AudioContent.ToByteArray()); - long? timestampToPass = methodStart; - soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave, null, timestampToPass); + var audioStream = new MemoryStream(response.AudioContent.ToByteArray()); + soundQueue.EnqueueSound(audioStream, source, volume, sampleRate, null, methodStart); } } } - catch (OperationCanceledException) - { - // Silent Cancellation if token is set to Cancelled - } - catch (Grpc.Core.RpcException ex) when (ex.StatusCode == Grpc.Core.StatusCode.Cancelled) - { - // Handle gRPC specific cancellation - } + catch (OperationCanceledException) { /* Silent */ } + catch (Grpc.Core.RpcException ex) when (ex.StatusCode == Grpc.Core.StatusCode.Cancelled) { /* Silent */ } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs index 221a3c54..4a9d61d2 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs @@ -6,6 +6,8 @@ public class GoogleCloudVoicePreset : VoicePreset { public float Volume { get; set; } + public int? SampleRate { get; set; } + // 0.25 - 2.0 (default 1.0) public float? 
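The sample-rate switch above targets StreamFormat members whose definitions are not included in this excerpt. Inferred from their usages across these patches, the enum presumably looks roughly like the sketch below; the real file may name or order members differently:

// Sketch only: StreamFormat members as implied by their call sites in this series.
public enum StreamFormat
{
    Mp3,      // compressed MP3 chunks (Polly)
    Wave,     // raw 16-bit PCM at 24 kHz (Google streaming default)
    Wave22K,  // raw 16-bit PCM at 22.05 kHz
    Wave16K,  // raw 16-bit PCM at 16 kHz (Azure Raw16Khz16BitMonoPcm)
    Wave8K,   // raw 16-bit PCM at 8 kHz
    Uberduck, // complete RIFF/WAV container
}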
@@ -19,6 +21,7 @@ public override bool TrySetDefaultValues()
     {
         Volume = 1.0f;
         PlaybackRate = 1.0f;
+        SampleRate = 22050;
         Locale = "en-US";
         VoiceName = "en-US-Chirp-HD-D";
         Gender = "Male";
diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs
index 3477cfb9..e6b0c88b 100644
--- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs
+++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs
@@ -14,14 +14,24 @@ public class OpenAiBackend : VoiceBackend
 {
     private readonly OpenAiBackendUI ui;
     private readonly OpenAiBackendUIModel uiModel;
+    private readonly OpenAiClient openAiClient;
     private readonly INotificationService notificationService;
     private readonly LatencyTracker latencyTracker;
+    private readonly StreamingSoundQueue soundQueue;
+    private string apiKey;
 
     public OpenAiBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker)
     {
+        var credentials = OpenAiCredentialManager.LoadCredentials();
+        if (credentials != null)
+        {
+            apiKey = (credentials.Password);
+        }
+        this.soundQueue = new StreamingSoundQueue(config, latencyTracker);
         this.uiModel = new OpenAiBackendUIModel(config, http, latencyTracker);
         this.ui = new OpenAiBackendUI(uiModel, config, this);
         this.notificationService = notificationService;
+        this.openAiClient = new OpenAiClient(soundQueue, apiKey);
     }
 
     public override void DrawStyles(IConfigUIDelegates helpers)
@@ -38,7 +48,7 @@ public override void Say(SayRequest request)
     {
         try
         {
-            await this.uiModel.OpenAi.Say(request.Text, voicePreset.Model, request.Source, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty), 1.0f, voicePreset.Volume);
+            await this.openAiClient.Say(request.Text, voicePreset.Model, request.Source, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty), 1.0f, voicePreset.Volume);
         }
         catch (OpenAiUnauthorizedException e)
         {
@@ -70,36 +80,36 @@ public override void Say(SayRequest request)
 
     public override void CancelAllSpeech()
     {
-        this.uiModel.SoundQueue.CancelAllSounds();
+        this.soundQueue.CancelAllSounds();
 
-        if (uiModel.OpenAi._ttsCts != null)
+        if (this.openAiClient._ttsCts != null)
         {
-            uiModel.OpenAi._ttsCts.Cancel();
-            uiModel.OpenAi._ttsCts.Dispose();
-            uiModel.OpenAi._ttsCts = null;
+            this.openAiClient._ttsCts.Cancel();
+            this.openAiClient._ttsCts.Dispose();
+            this.openAiClient._ttsCts = null;
         }
 
-        this.uiModel.SoundQueue.StopHardware();
+        this.soundQueue.StopHardware();
     }
 
     public override void CancelSay(TextSource source)
     {
-        this.uiModel.SoundQueue.CancelFromSource(source);
+        this.soundQueue.CancelFromSource(source);
 
-        if (uiModel.OpenAi._ttsCts != null)
+        if (this.openAiClient._ttsCts != null)
         {
-            uiModel.OpenAi._ttsCts.Cancel();
-            uiModel.OpenAi._ttsCts.Dispose();
-            uiModel.OpenAi._ttsCts = null;
+            this.openAiClient._ttsCts.Cancel();
+            this.openAiClient._ttsCts.Dispose();
+            this.openAiClient._ttsCts = null;
         }
 
-        if (uiModel.SoundQueue._ttsCts != null)
+        if (this.soundQueue._ttsCts != null)
         {
-            uiModel.OpenAi._ttsCts.Cancel();
-            uiModel.OpenAi._ttsCts.Dispose();
-            uiModel.OpenAi._ttsCts = null;
+            this.soundQueue._ttsCts.Cancel();
+            this.soundQueue._ttsCts.Dispose();
+            this.soundQueue._ttsCts = null;
         }
 
-        this.uiModel.SoundQueue.StopHardware();
+        this.soundQueue.StopHardware();
     }
 
     public override void DrawSettings(IConfigUIDelegates helpers)
@@ -118,7 +128,7 @@ protected override void Dispose(bool disposing)
     {
         if (disposing)
         {
-            this.uiModel.SoundQueue.Dispose();
+            this.soundQueue.Dispose();
         }
     }
 }
\ No newline at end of file
diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs
index 156a1b51..3067bf80 100644
--- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs
+++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs
@@ -37,24 +37,10 @@ public class OpenAiBackendUIModel
     /// Gets the valid voices for the current voice engine.
     /// NOTE: Currently there is no endpoint which provides this information for OpenAI.
     /// 
-    // public IReadOnlyDictionary> Voices { get; private set; }
 
     public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker)
     {
-        SoundQueue = new StreamingSoundQueue(config, latencyTracker);
-        var credentials = OpenAiCredentialManager.LoadCredentials();
-        if (credentials != null)
-        {
-            apiKey = (credentials.Password);
-        }
-        //RawStreamingSoundQueue = new RawStreamingSoundQueue(config);
-        OpenAi = new OpenAiClient(SoundQueue, apiKey);
         this.config = config;
-        this.apiKey = "";
-
-        // this.Voices = new Dictionary>();
-
     }
 
     /// 
diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs
index ad5d740a..015d966a 100644
--- a/src/TextToTalk/Backends/Piper/PiperBackend.cs
+++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs
@@ -279,9 +279,9 @@ public async Task Say(string text, PiperVoicePreset voicePreset, TextSource sour
         // 8. Determine Audio Format
         var format = voicePreset.InternalName switch
         {
-            string name when name.EndsWith("low") => StreamFormat.PiperLow,
-            string name when name.EndsWith("high") => StreamFormat.PiperHigh,
-            _ => StreamFormat.Piper // Defaults to Medium/Standard
+            string name when name.EndsWith("low") => StreamFormat.Wave16K,
+            string name when name.EndsWith("high") => StreamFormat.Wave,
+            _ => StreamFormat.Wave22K // Defaults to Medium/Standard
         };
 
         // 9. Enqueue Stream
diff --git a/src/TextToTalk/Backends/StreamFormat.cs b/src/TextToTalk/Backends/StreamFormat.cs
index 607e9987..d7ae8fca 100644
--- a/src/TextToTalk/Backends/StreamFormat.cs
+++ b/src/TextToTalk/Backends/StreamFormat.cs
@@ -4,11 +4,9 @@ public enum StreamFormat
 {
     Mp3,
     Wave,
+    Wave8K,
+    Wave16K,
+    Wave22K,
     Raw,
-    Azure,
     System,
-    Uberduck,
-    Piper,
-    PiperLow,
-    PiperHigh,
 }
\ No newline at end of file
diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs
index 69716af7..a3ee4d8b 100644
--- a/src/TextToTalk/Backends/StreamingSoundQueue.cs
+++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs
@@ -29,10 +29,10 @@ public class StreamingSoundQueue(PluginConfiguration config, LatencyTracker late
     private readonly object soundLock = new();
 
     // 1. Unified Audio Configuration
-    private static readonly WaveFormat Uberduck = new(22050, 16, 1);
+    private static readonly WaveFormat Wave8k = new(8000, 16, 1);
+    private static readonly WaveFormat Wave16k = new(16000, 16, 1);
+    private static readonly WaveFormat Wave22k = new(22050, 16, 1);
     private static readonly WaveFormat Wave = new(24000, 16, 1);
-    private static readonly WaveFormat Mp3 = new(24000, 16, 1);
-    private static readonly WaveFormat Azure = new(16000, 16, 1);
 
     private bool _isDisposed;
     public CancellationTokenSource? _ttsCts;
@@ -61,7 +61,7 @@ protected override void OnSoundLoop(StreamingSoundQueueItem nextItem)
         }
 
         // 2. Branch logic based on format (Encoded vs Raw)
-        if (nextItem.Format == StreamFormat.Mp3 || nextItem.Format == StreamFormat.Uberduck)
+        if (nextItem.Format == StreamFormat.Mp3)
         {
             ProcessMp3Stream(nextItem);
         }
@@ -84,14 +84,8 @@ private void ProcessMp3Stream(StreamingSoundQueueItem nextItem)
         {
             Mp3Frame frame;
 
-            try
-            {
-                frame = Mp3Frame.LoadFromStream(readFullyStream);
-            }
-            catch (Exception) // Catching interruptions here
-            {
-                break;
-            }
+            frame = Mp3Frame.LoadFromStream(readFullyStream);
+
             if (frame == null) break;
 
@@ -154,10 +148,9 @@ private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem)
         WaveFormat chunkFormat = nextItem.Format switch
         {
             StreamFormat.Wave => Wave,
-            StreamFormat.Azure => Azure,
-            StreamFormat.Piper => Uberduck,
-            StreamFormat.PiperLow => Azure,
-            StreamFormat.PiperHigh => Wave,
+            StreamFormat.Wave8K => Wave8k,
+            StreamFormat.Wave16K => Wave16k,
+            StreamFormat.Wave22K => Wave22k,
             _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."),
         };
 
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
index a1cb4dce..d38f293f 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs
@@ -116,21 +116,32 @@ public void DrawSettings(IConfigUIDelegates helpers)
         var voiceCategoriesFlat = voiceCategories.SelectMany(vc => vc.Value).ToList();
         var voiceDisplayNames = voiceCategoriesFlat.Select(v => v.DisplayName).ToArray();
         var voiceIds = voiceCategoriesFlat.Select(v => v.Name).ToArray();
+        // 1. Get the index
         var voiceIndex = Array.IndexOf(voiceIds, currentVoicePreset.VoiceName);
-        if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", voiceDisplayNames[voiceIndex]))
+
+        // 2. Validate the index and determine the text to show in the combo box
+        // If -1, we show a placeholder or the raw name instead of crashing
+        string previewValue = (voiceIndex >= 0 && voiceIndex < voiceDisplayNames.Length)
+            ? voiceDisplayNames[voiceIndex]
+            : "Select a voice..."; // Fallback text
+
+        if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", previewValue))
         {
             foreach (var (category, voices) in voiceCategories)
             {
                 ImGui.Selectable(category, false, ImGuiSelectableFlags.Disabled);
                 foreach (var voice in voices)
                 {
-                    if (ImGui.Selectable($" {voice.DisplayName}"))
+                    // Highlight the currently selected item
+                    bool isSelected = voice.Name == currentVoicePreset.VoiceName;
+
+                    if (ImGui.Selectable($" {voice.DisplayName}##{voice.Name}", isSelected))
                     {
                         currentVoicePreset.VoiceName = voice.Name;
                         this.config.Save();
                     }
 
-                    if (voice.Name == currentVoicePreset.VoiceName)
+                    if (isSelected)
                     {
                         ImGui.SetItemDefaultFocus();
                     }
diff --git a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
index 43673831..297b740b 100644
--- a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
+++ b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs
@@ -83,7 +83,7 @@ public async Task Say(string voice, int playbackRate, float volume, TextSource s
 
             var waveStream = new MemoryStream(audioBytes);
             long? timestampToPass = methodStart;
-            this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Uberduck, null, timestampToPass);
+            this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Mp3, null, timestampToPass);
            }
        }
    }
@@ -144,6 +144,8 @@ public async Task>> UpdateVoices()
                 Log.Information($"Response = {response.StatusCode}");
             }
         }
+        else
+            Log.Information("Authorization not set, cannot update voices.");
 
         return new Dictionary>();
     }
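Note on the shared pattern above: with this refactor, every PCM-producing backend (GoogleCloudClient, PiperBackend) reduces to the same two steps of mapping a sample rate to a StreamFormat and enqueueing raw chunks on the StreamingSoundQueue. The following is only an illustrative sketch using names that appear in this patch; the helper method itself is hypothetical and not part of the change set.

// Illustrative sketch only - not part of this patch.
// Maps a PCM sample rate to the StreamFormat values added in StreamFormat.cs.
private static StreamFormat ToStreamFormat(int? sampleRateHertz) => sampleRateHertz switch
{
    24000 => StreamFormat.Wave,    // 24 kHz, 16-bit mono per StreamingSoundQueue
    22050 => StreamFormat.Wave22K,
    16000 => StreamFormat.Wave16K,
    8000  => StreamFormat.Wave8K,
    _     => StreamFormat.Wave22K  // same fallback GoogleCloudClient uses
};

// Typical backend usage, assuming a StreamingSoundQueue field named soundQueue:
//   var format = ToStreamFormat(rate);
//   soundQueue.EnqueueSound(new MemoryStream(pcmBytes), source, volume, format, null, methodStart);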