diff --git a/src/TextToTalk.Lexicons/LexiconManager.cs b/src/TextToTalk.Lexicons/LexiconManager.cs index d4b58b4..3bc2d28 100644 --- a/src/TextToTalk.Lexicons/LexiconManager.cs +++ b/src/TextToTalk.Lexicons/LexiconManager.cs @@ -97,6 +97,7 @@ public string MakeSsml( bool includeSpeakAttributes = true) { + text = System.Security.SecurityElement.Escape(text); foreach (var (_, lexicon) in this.lexicons) { foreach (var lexeme in lexicon.Lexemes.Where(lexeme => text.Contains(lexeme.Grapheme))) diff --git a/src/TextToTalk.Tests/packages.lock.json b/src/TextToTalk.Tests/packages.lock.json index 7d4f627..c19dfe4 100644 --- a/src/TextToTalk.Tests/packages.lock.json +++ b/src/TextToTalk.Tests/packages.lock.json @@ -87,119 +87,114 @@ "resolved": "14.0.1", "contentHash": "y0WWyUE6dhpGdolK3iKgwys05/nZaVf4ZPtIjpLhJBZvHxkkiE23zYRo7K7uqAgoK/QvK5cqF6l3VG5AbgC6KA==" }, - "Fare": { - "type": "Transitive", - "resolved": "2.2.1", - "contentHash": "21XZo/yuXK1k0EUhdLnjgRD4n0HQYmPFchV6uaORcRc65rasZ1vdm2dmJXPBKZiIBztRRYRmmg/B76W721VWkA==" - }, "Google.Api.CommonProtos": { "type": "Transitive", - "resolved": "2.16.0", - "contentHash": "37MuZrE9AAqHAdYgFLoTHydAiXDRriQZGVKEg6fr6ASnrY5GtauYXnQrGk5x2K3NmYzEXe+wkpaPVmxjb3NKjg==", + "resolved": "2.17.0", + "contentHash": "elfQPknFr495hm7vdy6ZlgyQh6yzZq9TU7sS35L/Fj/fqjM/mUGau9gVJLhvQEtUlPjtR80hpn/m9HvBMyCXIw==", "dependencies": { - "Google.Protobuf": "[3.28.2, 4.0.0)" + "Google.Protobuf": "[3.31.1, 4.0.0]" } }, "Google.Api.Gax": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "fjHHYcQ99u0ztqwT537rvVtJMdDy6G2VHBZ+F1cBjDGYNVZfrpk40DMQ/OpUGToT9ZGHVirhh3eJ73bw2ANVPQ==", + "resolved": "4.12.1", + "contentHash": "G62dRNOv5DolfRviT6CCrL2a5nZ/CWWdRzhADkGnpCkYSOc3QnH5xxRvZiOKuHU8weJ/pAqAqrj7+T9IWdlu2Q==", "dependencies": { "Microsoft.Bcl.AsyncInterfaces": "6.0.0", - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Api.Gax.Grpc": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "ToCx/0cs+wJ9j7vzKRcPAKneJVZrz/s9JhW9QsFx1dar9WzTxawQZ8xTjyieSy8tY0UiYCL1qYkn/iRrklYnSA==", + "resolved": "4.12.1", + "contentHash": "W3LjuitOWxWyvbwqeHvpgp0LdshEiTnw/pneDAfAhQ02VgU2gVEzSXfGNPsvL8hDPBXjngR/fWNme8Kungwwkw==", "dependencies": { - "Google.Api.CommonProtos": "2.16.0", - "Google.Api.Gax": "4.9.0", - "Google.Apis.Auth": "1.68.0", - "Grpc.Auth": "2.66.0", - "Grpc.Core.Api": "2.66.0", - "Grpc.Net.Client": "2.66.0", + "Google.Api.CommonProtos": "2.17.0", + "Google.Api.Gax": "4.12.1", + "Google.Apis.Auth": "1.72.0", + "Grpc.Auth": "[2.71.0, 3.0.0)", + "Grpc.Core.Api": "[2.71.0, 3.0.0)", + "Grpc.Net.Client": "[2.71.0, 3.0.0)", "Microsoft.Extensions.DependencyInjection.Abstractions": "6.0.0" } }, "Google.Apis": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "s2MymhdpH+ybZNBeZ2J5uFgFHApBp+QXf9FjZSdM1lk/vx5VqIknJwnaWiuAzXxPrLEkesX0Q+UsiWn39yZ9zw==", + "resolved": "1.72.0", + "contentHash": "QbSJ08W7QuqsfzDPOZDHl1aFzCYwMcfBoHqQRh7koglwDN5WacShCKYMpU/zR1Pf3h3sH6JTGEeM/txAxaJuEg==", "dependencies": { - "Google.Apis.Core": "1.68.0" + "Google.Apis.Core": "1.72.0" } }, "Google.Apis.Auth": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "hFx8Qz5bZ4w0hpnn4tSmZaaFpjAMsgVElZ+ZgVLUZ2r9i+AKcoVgwiNfv1pruNS5cCvpXqhKECbruBCfRezPHA==", + "resolved": "1.72.0", + "contentHash": "RBoFwFKBHKUjuyJf2weEnqICQLaY0TdIrdFv2yC8bsiR2VFYxizOn3C/qN1FWCCb0Uh9GhW+zwAV1yUxPjiocw==", "dependencies": { - "Google.Apis": "1.68.0", - "Google.Apis.Core": "1.68.0", + "Google.Apis": "1.72.0", + "Google.Apis.Core": "1.72.0", "System.Management": "7.0.2" } }, 
"Google.Apis.Core": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "pAqwa6pfu53UXCR2b7A/PAPXeuVg6L1OFw38WckN27NU2+mf+KTjoEg2YGv/f0UyKxzz7DxF1urOTKg/6dTP9g==", + "resolved": "1.72.0", + "contentHash": "ZmYX1PU0vTKFT42c7gp4zaYcb/0TFAXrt9qw8yEz0wjvaug85+/WddlPTfT525Qei8iIUsF6t4bHYrsb2O7crg==", "dependencies": { - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Cloud.TextToSpeech.V1": { "type": "Transitive", - "resolved": "3.9.0", - "contentHash": "JpejhPzzEQ6rdaf0nsjjJwj1CJb8Zs0x+TH27+A17KF2g0NqrgtAbpkUZTiGlQHhOzJSF1lB3amrQhbGjozJ3A==", + "resolved": "3.17.0", + "contentHash": "27vM1NEBmCqAwqagwS0aEHfRBrFy7z6Ef+BblwKMaxtUUY0amdUdeXLY/PU8RSIHtJoan1K6ZKIS6YYqzgp77g==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.9.0, 5.0.0)", - "Google.LongRunning": "[3.3.0, 4.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)", + "Google.LongRunning": "[3.5.0, 4.0.0)" } }, "Google.LongRunning": { "type": "Transitive", - "resolved": "3.3.0", - "contentHash": "F2SZ83Jo466Wj/s1Z7QhIAmWBXxJZQyXZpcx0P8BR7d6s0FAj67vQjeUPESSJcvsy8AqYiYBhkUr2YpZhTQeHg==", + "resolved": "3.5.0", + "contentHash": "W8xO6FA+rG8WjKOsyIjTKjeKLcyCrjBBYeEdZ4QBkKQcxmRczbrfKhKQmdorb2V35CqXeeTbue5Na6Zkgyv8ow==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.8.0, 5.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)" } }, "Google.Protobuf": { "type": "Transitive", - "resolved": "3.28.2", - "contentHash": "Z86ZKAB+v1B/m0LTM+EVamvZlYw/g3VND3/Gs4M/+aDIxa2JE9YPKjDxTpf0gv2sh26hrve3eI03brxBmzn92g==" + "resolved": "3.31.1", + "contentHash": "gSnJbUmGiOTdWddPhqzrEscHq9Ls6sqRDPB9WptckyjTUyx70JOOAaDLkFff8gManZNN3hllQ4aQInnQyq/Z/A==" }, "Grpc.Auth": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "FRQlhMAcHf0GjAXIfhN6RydfZncLLXNNTOtpLL1bt57kp59vu40faW+dr6Vwl7ef/IUFfF38aiB5jvhAA/9Aow==", + "resolved": "2.71.0", + "contentHash": "t2aGh/pMgqmc3GimtYfC7VcgVY/VSbk6SLH+61wewsgK45tzxxD9nYYItT5bpLn7fbebirmHXfgJcVKIArd0cg==", "dependencies": { - "Google.Apis.Auth": "1.68.0", - "Grpc.Core.Api": "2.66.0" + "Google.Apis.Auth": "1.69.0", + "Grpc.Core.Api": "2.71.0" } }, "Grpc.Core.Api": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "HsjsQVAHe4hqP4t4rpUnmq+MZvPdyrlPsWF4T5fbMvyP3o/lMV+KVJfDlaNH8+v0aGQTVT3EsDFufbhaWb52cw==" + "resolved": "2.71.0", + "contentHash": "QquqUC37yxsDzd1QaDRsH2+uuznWPTS8CVE2Yzwl3CvU4geTNkolQXoVN812M2IwT6zpv3jsZRc9ExJFNFslTg==" }, "Grpc.Net.Client": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "GwkSsssXFgN9+M2U+UQWdErf61sn1iqgP+2NRBlDXATcP9vlxda0wySxd/eIL8U522+SnyFNUXlvQ5tAzGk9cA==", + "resolved": "2.71.0", + "contentHash": "U1vr20r5ngoT9nlb7wejF28EKN+taMhJsV9XtK9MkiepTZwnKxxiarriiMfCHuDAfPUm9XUjFMn/RIuJ4YY61w==", "dependencies": { - "Grpc.Net.Common": "2.66.0", + "Grpc.Net.Common": "2.71.0", "Microsoft.Extensions.Logging.Abstractions": "6.0.0" } }, "Grpc.Net.Common": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "YJpQpIvpo0HKlsG6SHwaieyji08qfv0DdEDIewCAA0egQY08637sHOj1netLGUhzBEsCqlGC3e92TZ2uqhxnvw==", + "resolved": "2.71.0", + "contentHash": "v0c8R97TwRYwNXlC8GyRXwYTCNufpDfUtj9la+wUrZFzVWkFJuNAltU+c0yI3zu0jl54k7en6u2WKgZgd57r2Q==", "dependencies": { - "Grpc.Core.Api": "2.66.0" + "Grpc.Core.Api": "2.71.0" } }, "KokoroSharp": { @@ -245,13 +240,16 @@ }, "Microsoft.Extensions.DependencyInjection.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "xlzi2IYREJH3/m6+lUrQlujzX8wDitm4QGnUu6kUXTQAWPuZY8i+ticFJbzfqaetLA6KR/rO6Ew/HuYD+bxifg==" + "resolved": "8.0.2", + "contentHash": 
"3iE7UF7MQkCv1cxzCahz+Y/guQbTqieyxyaWKhrRO91itI9cOKO76OHeQDahqG4MmW5umr3CcCvGmK92lWNlbg==" }, "Microsoft.Extensions.Logging.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "/HggWBbTwy8TgebGSX5DBZ24ndhzi93sHUBDvP1IxbZD7FDokYzdAr6+vbWGjw2XAfR2EJ1sfKUotpjHnFWPxA==" + "resolved": "8.0.3", + "contentHash": "dL0QGToTxggRLMYY4ZYX5AMwBb+byQBd/5dMiZE07Nv73o6I5Are3C7eQTh7K2+A4ct0PVISSr7TZANbiNb2yQ==", + "dependencies": { + "Microsoft.Extensions.DependencyInjection.Abstractions": "8.0.2" + } }, "Microsoft.ML.OnnxRuntime": { "type": "Transitive", @@ -261,10 +259,36 @@ "Microsoft.ML.OnnxRuntime.Managed": "1.22.0" } }, + "Microsoft.ML.OnnxRuntime.Gpu": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "4GNQUc6FHiWHvp95Yhu95SUDa6HVm+RSQxm7QCH3PIlderDhTPdU98fHHKXmLy4xIQikkEraMcGe+KXEQU5tew==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Gpu.Linux": "1.23.2", + "Microsoft.ML.OnnxRuntime.Gpu.Windows": "1.23.2", + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, + "Microsoft.ML.OnnxRuntime.Gpu.Linux": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "bcv2zpP8GNnfdUCkOjE9lzIoslAOCuY0T9QHpI5+Qm6qUcehRPtGC8wF4nvySwyfTe0g3rVINP3SSj1zinkE7Q==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, + "Microsoft.ML.OnnxRuntime.Gpu.Windows": { + "type": "Transitive", + "resolved": "1.23.2", + "contentHash": "qOU3DVcxq4XalFV3wlrNrdatYWufIqvg8FZqVC3LS2rFPoTfl++xpMC2nnaxB2Wc5jrpDb2izrcDsQatCyjVnA==", + "dependencies": { + "Microsoft.ML.OnnxRuntime.Managed": "1.23.2" + } + }, "Microsoft.ML.OnnxRuntime.Managed": { "type": "Transitive", - "resolved": "1.22.0", - "contentHash": "zlG3eY5mJnx1BhYAxRwpuHCGHzl3B+cY5/se0RmlVBw6Yh6QTGjPAXdjhlBIcw6BPFhgMn9lxWPE/U3Fvis+BQ==", + "resolved": "1.23.2", + "contentHash": "HtlQuzmVrqhnkmwfmkQ+2re8xPxtVmeLRQaYSJ3pXfzKs4b36+yBfa/LnDuzfX1bGcyWn/McKxmbY87TCAmo1Q==", "dependencies": { "System.Numerics.Tensors": "9.0.0" } @@ -339,14 +363,22 @@ }, "Newtonsoft.Json": { "type": "Transitive", - "resolved": "13.0.3", - "contentHash": "HrC5BXdl00IP9zeV+0Z848QWPAoCr9P3bDEZguI+gkLcBKAOxix/tLEAAHC+UvDNPv4a2d18lOReHMOagPa+zQ==" + "resolved": "13.0.4", + "contentHash": "pdgNNMai3zv51W5aq268sujXUyx7SNdE2bj1wZcWjAQrKMFZV260lbqYop1d2GM67JI1huLRwxo9ZqnfF/lC6A==" }, "NumSharp": { "type": "Transitive", "resolved": "0.30.0", "contentHash": "1f8m2B/m/ZSsICaqLszspCyA9/sTHK7wBKEH5KsxGg/r3QCYTc2HnfYOGMeCytvo8/j0v/umn5umLOLhdExlFA==" }, + "OpenAI": { + "type": "Transitive", + "resolved": "2.8.0", + "contentHash": "KcYpZ9IhuxFD2hGAJlL5vABtkr00CjeJU0SY8CjZQyzvzkzLop8jhdX3iDvteVJg6e3y4TEiY+Kti4gDJAagnA==", + "dependencies": { + "System.ClientModel": "1.8.1" + } + }, "OpenTK.Audio.OpenAL": { "type": "Transitive", "resolved": "5.0.0-pre.13", @@ -366,16 +398,44 @@ "resolved": "5.0.0-pre.13", "contentHash": "65qbZS49AfrTM6jtZ2RDTWAzLe13ywCXIiSP5QrAJLmZT6sQqHGd1LfFXLhx8Ccp77qy7qh/LHsxpUOlkgZTCg==" }, + "PiperSharp": { + "type": "Transitive", + "resolved": "1.0.6", + "contentHash": "g68TbampKc0ATx80nur6LHHrhIpXvmioIVuwAuWKcjTXTB2tf+Klk4JPwzWZRo+DRSR4kS370eh+davEQVR0cw==", + "dependencies": { + "NAudio": "2.2.1", + "NAudio.Core": "2.2.1", + "Newtonsoft.Json": "13.0.1", + "SharpCompress": "0.36.0" + } + }, "R3": { "type": "Transitive", "resolved": "1.2.9", "contentHash": "dKMFt90XW+n7JK2P40dx9uuLg57Pcj4cA/9n1NwdKWFcMAM6j49OU8h9EborpVe4KXI+2MV/EjKc1LG7fhQJUA==" }, + "SharpCompress": { + "type": "Transitive", + "resolved": "0.36.0", + "contentHash": 
"48am//T6Ou+GmyPmBaxaFN1ym0VNidRcBeANr9+OYTzpKRz8QMGzAkHVkCV30lFQ/gnWqGr50AuebahpG1C6xA==", + "dependencies": { + "ZstdSharp.Port": "0.7.4" + } + }, "Standart.Hash.xxHash": { "type": "Transitive", "resolved": "4.0.5", "contentHash": "2QC9zDPFT/SOnP7iFdK3AwakEcJ7D3zDoU7IwIAOyEhY4WQ2GQBvLqZ29/R1BSujPNtGHMITmVW1d+VjvLg6lg==" }, + "System.ClientModel": { + "type": "Transitive", + "resolved": "1.8.1", + "contentHash": "4oUQgw/vaO4FBOk3YsH40hbrjxRED1l95rRLvTMtHXfQxapXya9IfPpm/KgwValFFtYTfYGFOs/qzGmGyexicQ==", + "dependencies": { + "Microsoft.Extensions.Logging.Abstractions": "8.0.3", + "System.Memory.Data": "8.0.1" + } + }, "System.CodeDom": { "type": "Transitive", "resolved": "7.0.0", @@ -402,6 +462,11 @@ "System.CodeDom": "7.0.0" } }, + "System.Memory.Data": { + "type": "Transitive", + "resolved": "8.0.1", + "contentHash": "BVYuec3jV23EMRDeR7Dr1/qhx7369dZzJ9IWy2xylvb4YfXsrUxspWc4UWYid/tj4zZK58uGZqn2WQiaDMhmAg==" + }, "System.Numerics.Tensors": { "type": "Transitive", "resolved": "9.0.5", @@ -457,17 +522,24 @@ "resolved": "15.3.0", "contentHash": "F93japYa9YrJ59AZGhgdaUGHN7ITJ55FBBg/D/8C0BDgahv/rQD6MOSwHxOJJpon1kYyslVbeBrQ2wcJhox01w==" }, + "ZstdSharp.Port": { + "type": "Transitive", + "resolved": "0.7.4", + "contentHash": "ziptnotpUJr51afwXJQ5Wc03dvDiZAdmxS08s1g7SHn/VzbyZUXdH6yORk/zaNjzUOEE6pVZ0Nqztab0rYROgQ==" + }, "texttotalk": { "type": "Project", "dependencies": { "AWSSDK.Polly": "[3.7.401.37, )", "AdysTech.CredentialManager": "[2.6.0, )", "DalamudPackager": "[14.0.1, )", - "Fare": "[2.2.1, )", - "Google.Cloud.TextToSpeech.V1": "[3.9.0, )", + "Google.Cloud.TextToSpeech.V1": "[3.17.0, )", "KokoroSharp.CPU": "[0.6.1, )", "Microsoft.CognitiveServices.Speech": "[1.41.1, )", + "Microsoft.ML.OnnxRuntime.Gpu": "[1.23.2, )", "NAudio": "[2.2.1, )", + "OpenAI": "[2.8.0, )", + "PiperSharp": "[1.0.6, )", "R3": "[1.2.9, )", "Standart.Hash.xxHash": "[4.0.5, )", "System.Drawing.Common": "[9.0.0, )", diff --git a/src/TextToTalk/Backends/Azure/AzureBackend.cs b/src/TextToTalk/Backends/Azure/AzureBackend.cs index a6b7c26..48bb182 100644 --- a/src/TextToTalk/Backends/Azure/AzureBackend.cs +++ b/src/TextToTalk/Backends/Azure/AzureBackend.cs @@ -1,9 +1,7 @@ using Dalamud.Bindings.ImGui; -using FFXIVClientStructs.FFXIV.Client.Game.UI; using System; using System.Collections.Generic; using System.Net.Http; -using TextToTalk.Backends.ElevenLabs; using static TextToTalk.Backends.Azure.AzureClient; namespace TextToTalk.Backends.Azure; @@ -13,18 +11,19 @@ public class AzureBackend : VoiceBackend private readonly AzureBackendUI ui; private readonly AzureBackendUIModel uiModel; public List voices; + private readonly LatencyTracker latencyTracker; - public AzureBackend(PluginConfiguration config, HttpClient http) + public AzureBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFFF96800); var lexiconManager = new DalamudLexiconManager(); LexiconUtils.LoadFromConfigAzure(lexiconManager, config); - this.uiModel = new AzureBackendUIModel(config, lexiconManager); + this.uiModel = new AzureBackendUIModel(config, lexiconManager, latencyTracker); this.voices = this.uiModel.voices; this.ui = new AzureBackendUI(this.uiModel, config, lexiconManager, http, this); - + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs b/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs index c86d14a..c219b83 100644 --- 
a/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs +++ b/src/TextToTalk/Backends/Azure/AzureBackendUIModel.cs @@ -16,6 +16,8 @@ public class AzureBackendUIModel public List voices; private AzureLoginInfo loginInfo; + private readonly LatencyTracker latencyTracker; + /// /// Gets the currently-instantiated Azure client instance. /// @@ -31,13 +33,14 @@ public class AzureBackendUIModel /// public IReadOnlyList Voices => this.voices; - public AzureBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager) + public AzureBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager, LatencyTracker latencyTracker) { this.config = config; this.lexiconManager = lexiconManager; this.voices = new List(); this.loginInfo = new AzureLoginInfo(); + this.latencyTracker = latencyTracker; var credentials = AzureCredentialManager.LoadCredentials(); if (credentials != null) { @@ -97,7 +100,7 @@ private bool TryAzureLogin() try { DetailedLog.Info($"Logging into Azure region {this.loginInfo.Region}"); - Azure = new AzureClient(this.loginInfo.SubscriptionKey, this.loginInfo.Region, this.lexiconManager, this.config); + Azure = new AzureClient(this.loginInfo.SubscriptionKey, this.loginInfo.Region, this.lexiconManager, this.config, this.latencyTracker); // This should throw an exception if the login failed this.voices = Azure.GetVoicesWithStyles(); return true; diff --git a/src/TextToTalk/Backends/Azure/AzureClient.cs b/src/TextToTalk/Backends/Azure/AzureClient.cs index 14cf7a4..846c717 100644 --- a/src/TextToTalk/Backends/Azure/AzureClient.cs +++ b/src/TextToTalk/Backends/Azure/AzureClient.cs @@ -1,29 +1,46 @@ -using System; +using Microsoft.CognitiveServices.Speech; +using Microsoft.CognitiveServices.Speech.Audio; +using Serilog; +using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; +using System.Threading; using System.Threading.Tasks; -using Microsoft.CognitiveServices.Speech; -using Microsoft.CognitiveServices.Speech.Audio; +using System.Net.Http; using TextToTalk.Lexicons; namespace TextToTalk.Backends.Azure; public class AzureClient : IDisposable { + private readonly HttpClient _httpClient; + private readonly string _apiKey; + private readonly string _endpoint; + private readonly SpeechConfig speechConfig; private readonly SpeechSynthesizer synthesizer; - private readonly StreamSoundQueue soundQueue; - private readonly LexiconManager lexiconManager; + + private readonly StreamingSoundQueue soundQueue; + private readonly LexiconManager _lexiconManager; private readonly PluginConfiguration config; + private CancellationTokenSource? 
_ttsCts; - public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config) + public AzureClient(string subscriptionKey, string region, LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { - var audioConfig = AudioConfig.FromWavFileOutput("NUL"); - this.speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); - this.synthesizer = new SpeechSynthesizer(speechConfig, audioConfig); - this.soundQueue = new StreamSoundQueue(config); - this.lexiconManager = lexiconManager; + _apiKey = subscriptionKey; + _endpoint = $"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"; + + _httpClient = new HttpClient(); + _httpClient.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", _apiKey); + _httpClient.DefaultRequestHeaders.Add("User-Agent", "TextToTalkApp"); + + soundQueue = new StreamingSoundQueue(config, latencyTracker); + _lexiconManager = lexiconManager; + speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); + speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm); + synthesizer = new SpeechSynthesizer(speechConfig, null); } public TextSource GetCurrentlySpokenTextSource() @@ -32,16 +49,14 @@ public TextSource GetCurrentlySpokenTextSource() } public List GetVoicesWithStyles() { - // Fetches the voice result asynchronously and waits for completion var res = this.synthesizer.GetVoicesAsync().GetAwaiter().GetResult(); HandleResult(res); - // Maps each voice to a custom object containing Name and StyleList return res.Voices.Select(voice => new VoiceDetails { Name = voice.Name, ShortName = voice.ShortName, - Styles = voice.StyleList.ToList() // StyleList is a string[] + Styles = voice.StyleList.ToList() }).ToList(); } @@ -61,34 +76,65 @@ public List GetVoices() public async Task Say(string? voice, int playbackRate, float volume, TextSource source, string text, string style) { - var ssml = this.lexiconManager.MakeSsml( - text, - style, - voice: voice, - langCode: "en-US", - playbackRate: playbackRate, - includeSpeakAttributes: true); - DetailedLog.Verbose(ssml); + long methodStart = Stopwatch.GetTimestamp(); + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; - var res = await this.synthesizer.SpeakSsmlAsync(ssml); + + var ssml = _lexiconManager.MakeSsml(text, style, voice, "en-US", playbackRate, true); - HandleResult(res); + using var request = new HttpRequestMessage(HttpMethod.Post, _endpoint) + { + Content = new StringContent(ssml, global::System.Text.Encoding.UTF8, "application/ssml+xml") + }; + + request.Headers.Add("X-Microsoft-OutputFormat", "raw-16khz-16bit-mono-pcm"); //Raw for lower latency - var soundStream = new MemoryStream(res.AudioData); - soundStream.Seek(0, SeekOrigin.Begin); + using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); + response.EnsureSuccessStatusCode(); - this.soundQueue.EnqueueSound(soundStream, source, StreamFormat.Wave, volume); + using var responseStream = await response.Content.ReadAsStreamAsync(token); + + byte[] buffer = new byte[4096]; + int bytesRead; + + while ((bytesRead = await responseStream.ReadAsync(buffer, 0, buffer.Length, token)) > 0) + { + if (token.IsCancellationRequested) break; + + var chunk = new byte[bytesRead]; + Buffer.BlockCopy(buffer, 0, chunk, 0, bytesRead); + + var chunkStream = new MemoryStream(chunk); + long? 
timestampToPass = methodStart; + soundQueue.EnqueueSound(chunkStream, source, volume, StreamFormat.Wave16K, null, timestampToPass); + + } } + public Task CancelAllSounds() { + if (this._ttsCts != null) + { + this._ttsCts.Cancel(); //signal cancellation if in progress + } + this.synthesizer.StopSpeakingAsync(); this.soundQueue.CancelAllSounds(); + this.soundQueue.StopHardware(); return Task.CompletedTask; } public Task CancelFromSource(TextSource source) { + if (this._ttsCts != null) + { + this._ttsCts.Cancel(); //signal cancellation if in progress + } + this.synthesizer.StopSpeakingAsync(); this.soundQueue.CancelFromSource(source); + this.soundQueue.StopHardware(); return Task.CompletedTask; } @@ -100,29 +146,6 @@ private static void HandleResult(SynthesisVoicesResult res) } } - private static void HandleResult(SpeechSynthesisResult res) - { - if (res.Reason == ResultReason.Canceled) - { - var cancellation = SpeechSynthesisCancellationDetails.FromResult(res); - if (cancellation.Reason == CancellationReason.Error) - { - DetailedLog.Error($"Azure request error: ({cancellation.ErrorCode}) \"{cancellation.ErrorDetails}\""); - } - else - { - DetailedLog.Warn($"Azure request failed in state \"{cancellation.Reason}\""); - } - - return; - } - - if (res.Reason != ResultReason.SynthesizingAudioCompleted) - { - DetailedLog.Warn($"Speech synthesis request completed in incomplete state \"{res.Reason}\""); - } - } - public void Dispose() { this.synthesizer?.Dispose(); diff --git a/src/TextToTalk/Backends/BackendUI.cs b/src/TextToTalk/Backends/BackendUI.cs index 05249a8..d206904 100644 --- a/src/TextToTalk/Backends/BackendUI.cs +++ b/src/TextToTalk/Backends/BackendUI.cs @@ -2,6 +2,7 @@ using System.Linq; using Dalamud.Bindings.ImGui; using TextToTalk.UI; +using TextToTalk.UI.Windows; namespace TextToTalk.Backends; @@ -143,4 +144,38 @@ public static bool ImGuiPresetCombo(string label, SortedSet selectedPresets ImGui.EndCombo(); return didPresetsChange; } + public static bool ImGuiStylesCombo(string label, string previewText, SortedSet selectedIndices, List styles) + { + // Use the passed-in string, or a placeholder if it's empty + string displayValue = !string.IsNullOrEmpty(previewText) ? 
previewText : "None selected"; + + bool didChange = false; + + // The second parameter of BeginCombo controls what is shown in the closed box + if (ImGui.BeginCombo(label, displayValue)) + { + for (int i = 0; i < styles.Count; i++) + { + bool isSelected = selectedIndices.Contains(i); + + // Use Selectable with DontClosePopups for multi-select + if (ImGui.Selectable(styles[i], isSelected, ImGuiSelectableFlags.DontClosePopups)) + { + if (!isSelected) + selectedIndices.Add(i); + else + selectedIndices.Remove(i); + + didChange = true; + } + + if (isSelected) + ImGui.SetItemDefaultFocus(); + } + + ImGui.EndCombo(); + } + + return didChange; + } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs index d9dfe69..63c027b 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackend.cs @@ -9,17 +9,22 @@ namespace TextToTalk.Backends.ElevenLabs; public class ElevenLabsBackend : VoiceBackend { + private readonly ElevenLabsClient client; private readonly ElevenLabsBackendUI ui; private readonly ElevenLabsBackendUIModel uiModel; private readonly INotificationService notificationService; private readonly PluginConfiguration config; + private readonly LatencyTracker latencyTracker; + private readonly StreamingSoundQueue soundQueue; - public ElevenLabsBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService) + public ElevenLabsBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker) { - this.uiModel = new ElevenLabsBackendUIModel(config, http); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); + this.uiModel = new ElevenLabsBackendUIModel(config, http, latencyTracker); this.ui = new ElevenLabsBackendUI(uiModel, config, this); this.notificationService = notificationService; this.config = config; + this.client = new ElevenLabsClient(soundQueue, http); } public override void DrawStyles(IConfigUIDelegates helpers) @@ -67,11 +72,21 @@ await this.uiModel.ElevenLabs.Say(elevenLabsVoicePreset.VoiceId, elevenLabsVoice public override void CancelAllSpeech() { this.uiModel.SoundQueue.CancelAllSounds(); + if (this.client._TtsCts != null) + { + this.client._TtsCts.Cancel(); + } + this.uiModel.SoundQueue.StopHardware(); } public override void CancelSay(TextSource source) { this.uiModel.SoundQueue.CancelFromSource(source); + if (this.client._TtsCts != null) + { + this.client._TtsCts.Cancel(); + } + this.uiModel.SoundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs index 02a5b76..7623702 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUI.cs @@ -150,18 +150,21 @@ public void DrawSettings() var modelDescriptionsList = modelDescriptions.Values.Select(v => v.Items.First()).ToList(); var selectedItemIndex = modelIdList.IndexOf(currentVoicePreset.ModelId); - string modelPreviewName = ""; - if (selectedItemIndex != -1) + string modelPreviewName = "Select a model..."; + bool previewHasStyles = false; + + if (selectedItemIndex >= 0 && selectedItemIndex < modelDescriptionsList.Count) { var selectedItem = modelDescriptionsList[selectedItemIndex]; modelPreviewName = $"{selectedItem.ModelId} || Cost 
Multiplier: {selectedItem.ModelRates["character_cost_multiplier"]}"; + if (currentVoicePreset.ModelId == "eleven_v3") { modelPreviewName += " [Styles Available]"; + previewHasStyles = true; } } - bool previewHasStyles = modelIdList[selectedItemIndex] == "eleven_v3"; string previewName = voiceIndex >= 0 ? $"{modelIdList[selectedItemIndex]} || Cost Multiplier: {modelDescriptionsList[selectedItemIndex].ModelRates["character_cost_multiplier"]}" : "Select a model..."; if (ImGui.BeginCombo($"Models##{MemoizedId.Create()}", "", ImGuiComboFlags.HeightLarge)) diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs index e2b002d..fac7bf8 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsBackendUIModel.cs @@ -21,7 +21,7 @@ public class ElevenLabsBackendUIModel : IDisposable /// /// Gets the sound playback queue. /// - public StreamSoundQueue SoundQueue { get; } + public StreamingSoundQueue SoundQueue { get; } /// /// Gets the currently-instantiated ElevenLabs client instance. @@ -44,9 +44,9 @@ public class ElevenLabsBackendUIModel : IDisposable public IReadOnlyDictionary> Voices { get; private set; } public IReadOnlyDictionary Items, Dictionary? Rates)> Models { get; private set; } - public ElevenLabsBackendUIModel(PluginConfiguration config, HttpClient http) + public ElevenLabsBackendUIModel(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { - SoundQueue = new StreamSoundQueue(config); + SoundQueue = new StreamingSoundQueue(config, latencyTracker); ElevenLabs = new ElevenLabsClient(SoundQueue, http); this.config = config; this.getUserSubscriptionInfoImmediately = new ReactiveProperty(0); diff --git a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs index 1c27094..7b8448b 100644 --- a/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs +++ b/src/TextToTalk/Backends/ElevenLabs/ElevenLabsClient.cs @@ -1,14 +1,17 @@ -using System; +using Newtonsoft.Json; +using Serilog; +using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.IO; using System.Linq; using System.Net; using System.Net.Http; using System.Net.Http.Headers; +using System.Text; +using System.Threading; using System.Threading.Tasks; -using Newtonsoft.Json; -using Serilog; namespace TextToTalk.Backends.ElevenLabs; @@ -17,11 +20,13 @@ public class ElevenLabsClient private const string UrlBase = "https://api.elevenlabs.io"; private readonly HttpClient http; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; public string? ApiKey { get; set; } - public ElevenLabsClient(StreamSoundQueue soundQueue, HttpClient http) + public CancellationTokenSource? _TtsCts; + + public ElevenLabsClient(StreamingSoundQueue soundQueue, HttpClient http) { this.http = http; this.soundQueue = soundQueue; @@ -30,57 +35,67 @@ public ElevenLabsClient(StreamSoundQueue soundQueue, HttpClient http) public async Task Say(string? voice, int playbackRate, float volume, float similarityBoost, float stability, TextSource source, string text, string? model, string? 
style) { - if (!IsAuthorizationSet()) - { - throw new ElevenLabsMissingCredentialsException("No ElevenLabs authorization keys have been configured."); - } - Log.Information($"Style String = {style}"); - if (style != "") - { - model = "eleven_v3"; //force eleven_v3 model for styles - text = $"[{style}] " + text; //append style tag to text - } - float finalStability = stability; - if (model == "eleven_v3") // eleven_v3 only supports stability float values 0.0, 0.5, 1.0 - { - finalStability = (float)Math.Round(stability * 2.0f, MidpointRounding.AwayFromZero) / 2.0f; - } - Log.Information($"Message String = {text}"); - Log.Information($"Model String = {model}"); - var args = new ElevenLabsTextToSpeechRequest + long methodStart = Stopwatch.GetTimestamp(); + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); + + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + + try { - Text = text, - ModelId = model, - VoiceSettings = new ElevenLabsVoiceSettings + if (!IsAuthorizationSet()) { - SimilarityBoost = similarityBoost, - Stability = finalStability, - }, - }; + throw new ElevenLabsMissingCredentialsException("No ElevenLabs authorization keys have been configured."); + } - // Make the request - var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; - using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); - AddAuthorization(req); - req.Headers.Add("accept", "audio/mpeg"); + if (!string.IsNullOrEmpty(style)) + { + model = "eleven_v3"; + text = $"[{style}] " + text; + } - DetailedLog.Verbose(JsonConvert.SerializeObject(args)); - using var content = new StringContent(JsonConvert.SerializeObject(args)); - content.Headers.ContentType = new MediaTypeHeaderValue("application/json"); - req.Content = content; - - var res = await this.http.SendAsync(req); - EnsureSuccessStatusCode(res); + float finalStability = stability; + if (model == "eleven_v3") + { + finalStability = (float)Math.Round(stability * 2.0f, MidpointRounding.AwayFromZero) / 2.0f; + } + + var args = new ElevenLabsTextToSpeechRequest + { + Text = text, + ModelId = model, + VoiceSettings = new ElevenLabsVoiceSettings + { + SimilarityBoost = similarityBoost, + Stability = finalStability, + }, + }; + + var uriBuilder = new UriBuilder(UrlBase) { Path = $"/v1/text-to-speech/{voice}/stream" }; - // Copy the sound to a new buffer and enqueue it - var responseStream = await res.Content.ReadAsStreamAsync(); - var mp3Stream = new MemoryStream(); - await responseStream.CopyToAsync(mp3Stream); - mp3Stream.Seek(0, SeekOrigin.Begin); + using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); + AddAuthorization(req); + req.Headers.Add("accept", "audio/mpeg"); - this.soundQueue.EnqueueSound(mp3Stream, source, StreamFormat.Mp3, volume); + using var content = new StringContent(JsonConvert.SerializeObject(args), Encoding.UTF8, "application/json"); + req.Content = content; + + var res = await this.http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct); // Using ResponseHeadersRead in order to stream as synth completes + EnsureSuccessStatusCode(res); + + var responseStream = await res.Content.ReadAsStreamAsync(ct); + long? 
timestampToPass = methodStart; + + this.soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Mp3, res, timestampToPass); + } + catch (OperationCanceledException) + { + Log.Information("TTS generation was cancelled."); // Catching cancellation + } } + public async Task GetUserSubscriptionInfo() { if (!IsAuthorizationSet()) diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs index fae42c1..178d9b1 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackend.cs @@ -6,12 +6,13 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudBackend : VoiceBackend { private readonly GoogleCloudClient client; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly GoogleCloudBackendUI ui; + private readonly LatencyTracker latencyTracker; - public GoogleCloudBackend(PluginConfiguration config) + public GoogleCloudBackend(PluginConfiguration config, LatencyTracker latencyTracker) { - soundQueue = new StreamSoundQueue(config); + soundQueue = new StreamingSoundQueue(config, latencyTracker); client = new GoogleCloudClient(soundQueue, config.GoogleCreds); ui = new GoogleCloudBackendUI(config, client, this); } @@ -25,18 +26,28 @@ public override void Say(SayRequest request) if (request.Voice is not GoogleCloudVoicePreset voicePreset) throw new InvalidOperationException("Invalid voice preset provided."); - _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.SampleRate, voicePreset.Pitch, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, + _ = client.Say(voicePreset.Locale, voicePreset.VoiceName, voicePreset.SampleRate, voicePreset.PlaybackRate, voicePreset.Volume, request.Source, request.Text); } public override void CancelAllSpeech() { soundQueue.CancelAllSounds(); + if (client._TtsCts != null) + { + client._TtsCts?.Cancel(); + } + soundQueue.StopHardware(); } public override void CancelSay(TextSource source) { soundQueue.CancelFromSource(source); + if (client._TtsCts != null) + { + client._TtsCts?.Cancel(); + } + soundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs index f08b399..d6123d1 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudBackendUI.cs @@ -101,7 +101,6 @@ public void DrawVoicePresetOptions() ImGui.EndCombo(); } - var validSampleRates = new[] { "8000", "16000", "22050", "24000" }; var sampleRate = currentVoicePreset.SampleRate.ToString(); var sampleRateIndex = Array.IndexOf(validSampleRates, sampleRate); @@ -112,15 +111,8 @@ public void DrawVoicePresetOptions() this.config.Save(); } - var pitch = currentVoicePreset.Pitch ?? 0; - if (ImGui.SliderFloat($"Pitch##{MemoizedId.Create()}", ref pitch, -10f, 10f, "%.2fx")) - { - currentVoicePreset.Pitch = pitch; - config.Save(); - } - var playbackRate = currentVoicePreset.PlaybackRate ?? 
1; - if (ImGui.SliderFloat($"Playback rate##{MemoizedId.Create()}", ref playbackRate, 0.25f, 4f, "%.2fx")) + if (ImGui.SliderFloat($"Playback rate##{MemoizedId.Create()}", ref playbackRate, 0.25f, 2f, "%.2fx")) { currentVoicePreset.PlaybackRate = playbackRate; config.Save(); diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs index e231d78..70ed402 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudClient.cs @@ -1,9 +1,11 @@ +using Google.Cloud.TextToSpeech.V1; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; +using System.Threading; using System.Threading.Tasks; -using Google.Cloud.TextToSpeech.V1; using WebSocketSharp; namespace TextToTalk.Backends.GoogleCloud; @@ -11,11 +13,13 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudClient { private TextToSpeechClient? client; - private readonly StreamSoundQueue? soundQueue; + private readonly StreamingSoundQueue? soundQueue; public Dictionary? Voices; public List? Locales; - public GoogleCloudClient(StreamSoundQueue soundQueue, string pathToCredential) + public CancellationTokenSource? _TtsCts; + + public GoogleCloudClient(StreamingSoundQueue soundQueue, string pathToCredential) { this.soundQueue = soundQueue; if (pathToCredential.IsNullOrEmpty()) return; @@ -33,6 +37,7 @@ public void Init(string pathToCredential) public Dictionary? GetGoogleTextToSpeechVoices() { if (client == null) return new Dictionary(); + var response = client.ListVoices(""); var fetchedVoices = new Dictionary(); @@ -65,37 +70,88 @@ public List ExtractUniqueLocales(List? voicesList) return uniqueLocales.ToList().OrderBy(lang => lang).ToList(); } - public async Task Say(string? locale, string? voice, int? sampleRate, float? pitch, float? speed, float volume, TextSource source, - string text) + public async Task Say(string? locale, string? voice, int? rate, float? speed, float volume, TextSource source, string text) { - if (client == null || soundQueue == null || locale == null) + long methodStart = Stopwatch.GetTimestamp(); + if (client == null || soundQueue == null || locale == null) return; + + bool isStreamingSupported = voice != null && + (voice.Contains("Chirp3-HD", StringComparison.OrdinalIgnoreCase) || + voice.Contains("Chirp-HD", StringComparison.OrdinalIgnoreCase)); + + if (_TtsCts != null) { - return; + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); } - var request = new SynthesizeSpeechRequest + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + + var sampleRate = rate switch { - Input = new SynthesisInput { Text = text }, - Voice = new VoiceSelectionParams - { - LanguageCode = locale, - Name = voice ?? "en-US-Wavenet-A" - }, - AudioConfig = new AudioConfig - { - AudioEncoding = AudioEncoding.Mp3, - SampleRateHertz = sampleRate ?? 22050, - Pitch = pitch ?? 0, - SpeakingRate = speed ?? 
1.0f, - VolumeGainDb = volume - } + 24000 => StreamFormat.Wave, + 22050 => StreamFormat.Wave22K, + 16000 => StreamFormat.Wave16K, + 8000 => StreamFormat.Wave8K, + _ => StreamFormat.Wave22K }; - var response = await client.SynthesizeSpeechAsync(request); + try + { + if (isStreamingSupported) + { + using var streamingCall = client.StreamingSynthesize(); + + await streamingCall.WriteAsync(new StreamingSynthesizeRequest + { + StreamingConfig = new StreamingSynthesizeConfig + { + Voice = new VoiceSelectionParams { LanguageCode = locale, Name = voice }, + StreamingAudioConfig = new StreamingAudioConfig + { + AudioEncoding = AudioEncoding.Pcm, + SampleRateHertz = rate ?? 22050, + SpeakingRate = speed ?? 1.0f, + } + } + }); + + await streamingCall.WriteAsync(new StreamingSynthesizeRequest { Input = new StreamingSynthesisInput { Text = text } }); + await streamingCall.WriteCompleteAsync(); - MemoryStream mp3Stream = new MemoryStream(response.AudioContent.ToByteArray()); - mp3Stream.Seek(0, SeekOrigin.Begin); + await foreach (var response in streamingCall.GetResponseStream().WithCancellation(ct)) + { + if (response.AudioContent.Length > 0) + { + var chunkStream = new MemoryStream(response.AudioContent.ToByteArray()); + soundQueue.EnqueueSound(chunkStream, source, volume, sampleRate, null, methodStart); + } + } + } + else + { + + var response = await client.SynthesizeSpeechAsync(new SynthesizeSpeechRequest + { + Input = new SynthesisInput { Text = text }, + Voice = new VoiceSelectionParams { LanguageCode = locale, Name = voice }, + AudioConfig = new AudioConfig + { + AudioEncoding = AudioEncoding.Linear16, + SampleRateHertz = rate ?? 22050, + SpeakingRate = speed ?? 1.0f, + } + }, ct); - soundQueue.EnqueueSound(mp3Stream, source, StreamFormat.Mp3, volume); + if (response.AudioContent.Length > 0) + { + var audioStream = new MemoryStream(response.AudioContent.ToByteArray()); + soundQueue.EnqueueSound(audioStream, source, volume, sampleRate, null, methodStart); + } + } + } + catch (OperationCanceledException) { /* Silent */ } + catch (Grpc.Core.RpcException ex) when (ex.StatusCode == Grpc.Core.StatusCode.Cancelled) { /* Silent */ } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs index fe8d597..4a9d61d 100644 --- a/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs +++ b/src/TextToTalk/Backends/GoogleCloud/GoogleCloudVoicePreset.cs @@ -4,14 +4,11 @@ namespace TextToTalk.Backends.GoogleCloud; public class GoogleCloudVoicePreset : VoicePreset { - public int? SampleRate { get; set; } - - // -20.0 - 20.0 is theoretical max, but it's lowered to work better with sliders (default 0.0) - public float? Pitch { get; set; } - public float Volume { get; set; } - // 0.25 - 4.0 (default 1.0) + public int? SampleRate { get; set; } + + // 0.25 - 2.0 (default 1.0) public float? PlaybackRate { get; set; } public string? 
Locale { get; set; } @@ -22,12 +19,11 @@ public class GoogleCloudVoicePreset : VoicePreset public override bool TrySetDefaultValues() { - SampleRate = 22050; - Pitch = 0.0f; Volume = 1.0f; PlaybackRate = 1.0f; + SampleRate = 22050; Locale = "en-US"; - VoiceName = "en-US-Wavenet-D"; + VoiceName = "en-US-Chirp-HD-D"; Gender = "Male"; EnabledBackend = TTSBackend.GoogleCloud; return true; diff --git a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs index 6c29df9..efe22e7 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroBackend.cs @@ -1,15 +1,16 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Game; +using KokoroSharp; +using KokoroSharp.Core; +using KokoroSharp.Processing; using System; +using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.IO; using System.Net.Http; using System.Security.Cryptography; using System.Threading; using System.Threading.Tasks; -using Dalamud.Game; -using Dalamud.Bindings.ImGui; -using KokoroSharp; -using KokoroSharp.Core; -using KokoroSharp.Processing; namespace TextToTalk.Backends.Kokoro; @@ -20,18 +21,20 @@ public class KokoroBackend : VoiceBackend private readonly KokoroBackendUI ui; private readonly Task modelTask; private readonly CancellationTokenSource cts = new(); + private readonly LatencyTracker latencyTracker; - public KokoroBackend(PluginConfiguration config) + public KokoroBackend(PluginConfiguration config, LatencyTracker latencyTracker) { ui = new KokoroBackendUI(config, this); Tokenizer.eSpeakNGPath = Path.Join(config.GetPluginAssemblyDirectory(), "espeak"); modelTask = GetModelAsync(config); - soundQueue = new KokoroSoundQueue(config, modelTask); + soundQueue = new KokoroSoundQueue(config, modelTask, latencyTracker); KokoroVoiceManager.LoadVoicesFromPath(Path.Join(config.GetPluginAssemblyDirectory(), "voices")); DetailedLog.Info($"Kokoro voices loaded: {KokoroVoiceManager.Voices.Count} voices available."); + this.latencyTracker = latencyTracker; } /// @@ -105,6 +108,8 @@ public override void Say(SayRequest request) public void Say(string text, KokoroVoicePreset voicePreset, TextSource source, ClientLanguage language) { + long methodStart = Stopwatch.GetTimestamp(); + long? timestampToPass = methodStart; if (!TryGetModel(out _)) { return; @@ -120,7 +125,7 @@ public void Say(string text, KokoroVoicePreset voicePreset, TextSource source, C } // TODO: apply lexicon once KokoroSharp supports it - soundQueue.EnqueueSound(new(text, voice, voicePreset.Speed ?? 1f, voicePreset.Volume ?? 0.6f, source, language)); + soundQueue.EnqueueSound(new(text, voice, voicePreset.Speed ?? 1f, voicePreset.Volume ?? 
0.6f, source, language, timestampToPass)); } public override void CancelAllSpeech() @@ -162,10 +167,6 @@ protected override void Dispose(bool disposing) cts.Cancel(); if (disposing) { - if (TryGetModel(out var model)) - { - model.Dispose(); - } soundQueue.Dispose(); } } diff --git a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs index 65d262f..d0feae1 100644 --- a/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs +++ b/src/TextToTalk/Backends/Kokoro/KokoroSoundQueue.cs @@ -1,25 +1,35 @@ -using System.Diagnostics.CodeAnalysis; -using System.IO; -using System.Threading.Tasks; -using Dalamud.Game; +using Dalamud.Game; using KokoroSharp; using KokoroSharp.Core; using KokoroSharp.Processing; +using NAudio.CoreAudioApi; +using NAudio.Wave; +using Serilog; +using System; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Threading; +using System.Threading.Tasks; namespace TextToTalk.Backends.Kokoro; public class KokoroSoundQueue : SoundQueue { - private readonly KokoroPlayback playback = new(); - private readonly StreamSoundQueue streamSoundQueue; + private static readonly WaveFormat WaveFormat = new(24000, 16, 1); + private readonly object soundLock = new(); private readonly PluginConfiguration config; private readonly Task modelTask; + private readonly LatencyTracker latencyTracker; - public KokoroSoundQueue(PluginConfiguration config, Task modelTask) + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + + public KokoroSoundQueue(PluginConfiguration config, Task modelTask, LatencyTracker latencyTracker) { this.config = config; this.modelTask = modelTask; - this.streamSoundQueue = new StreamSoundQueue(config); + this.latencyTracker = latencyTracker; } private bool TryGetModel([NotNullWhen(true)] out KokoroModel? model) @@ -33,85 +43,133 @@ private bool TryGetModel([NotNullWhen(true)] out KokoroModel? model) return false; } - public void EnqueueSound(KokoroSourceQueueItem item) - { - this.AddQueueItem(item); - } - - protected override void OnSoundCancelled() - { - GetCurrentItem()?.Cancel(); - } - - public override void CancelAllSounds() - { - base.CancelAllSounds(); - streamSoundQueue.CancelAllSounds(); - } - - public override void CancelFromSource(TextSource source) - { - base.CancelFromSource(source); - streamSoundQueue.CancelFromSource(source); - } - protected override void OnSoundLoop(KokoroSourceQueueItem nextItem) { - if (!TryGetModel(out var model) || nextItem.Aborted) + if (!TryGetModel(out var model) || nextItem.Aborted) return; + + lock (this.soundLock) { - return; + if (this.soundOut == null) + { + var mmDevice = GetWasapiDeviceFromGuid(config.SelectedAudioDeviceGuid); + this.bufferedProvider = new BufferedWaveProvider(WaveFormat) + { + ReadFully = false, + BufferDuration = TimeSpan.FromSeconds(30), + DiscardOnBufferOverflow = true + }; + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } } - var lang = nextItem.Language; - - // https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md - string langCode = lang switch + string langCode = nextItem.Language switch { ClientLanguage.Japanese => "ja", - ClientLanguage.English => "en", ClientLanguage.German => "de", ClientLanguage.French => "fr", - _ => "en", + _ => config.KokoroUseAmericanEnglish ? 
"en-us" : "en", }; - if (langCode == "en" && config.KokoroUseAmericanEnglish) + int[] tokens = Tokenizer.Tokenize(nextItem.Text, langCode, preprocess: true); + var segments = SegmentationSystem.SplitToSegments(tokens, new() { MaxFirstSegmentLength = 200 }); + + foreach (var chunk in segments) { - langCode = "en-us"; // Use American English for English language + if (nextItem.Aborted) break; + + var samples = model.Infer(chunk, nextItem.Voice.Features, nextItem.Speed); + byte[] bytes = KokoroPlayback.GetBytes(samples); + + // POST-INFERENCE ABORT CHECK: Prevent enqueuing "zombie" audio + if (nextItem.Aborted) break; + + lock (this.soundLock) + { + if (this.bufferedProvider != null && this.soundOut != null) + { + this.bufferedProvider.AddSamples(bytes, 0, bytes.Length); + if (this.soundOut.PlaybackState != PlaybackState.Playing) + { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + this.latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> Play): {Ms}", elapsed.TotalMilliseconds); + } + this.soundOut.Play(); + } + } + } } - // this is a blocking call! - int[] tokens = Tokenizer.Tokenize(nextItem.Text, langCode, preprocess: true); - if (nextItem.Aborted) + // 4. Wait for audio to finish playing if not aborted + while (!nextItem.Aborted && this.bufferedProvider?.BufferedBytes > 0) { - return; + Thread.Sleep(50); } + } - var tokensList = SegmentationSystem.SplitToSegments(tokens, new() - { - MinFirstSegmentLength = 20, - MaxFirstSegmentLength = 200, - MaxSecondSegmentLength = 200 - }); // Split tokens into chunks Kokoro can handle + protected override void OnSoundCancelled() + { + GetCurrentItem()?.Cancel(); + + StopHardware(); + } - foreach (var tokenChunk in tokensList) + private void StopHardware() + { + lock (this.soundLock) { - // this is a blocking call! - var samples = model.Infer(tokenChunk, nextItem.Voice.Features, nextItem.Speed); - if (nextItem.Aborted) + if (this.soundOut != null) + { + this.soundOut.Stop(); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) { - return; + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; } + } + } - var bytes = KokoroPlayback.GetBytes(samples); - var ms = new MemoryStream(bytes); - streamSoundQueue.EnqueueSound(ms, nextItem.Source, StreamFormat.Raw, nextItem.Volume); + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) + { + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) + { + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } + + protected override void Dispose(bool disposing) + { + if (disposing) StopHardware(); + base.Dispose(disposing); + } + + public void EnqueueSound(KokoroSourceQueueItem item) + { + this.AddQueueItem(item); } } + + + + public class KokoroSourceQueueItem : SoundQueueItem { - public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float volume, TextSource source, ClientLanguage language) + public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float volume, TextSource source, ClientLanguage language, long? 
startTime) { Source = source; Text = text; @@ -120,6 +178,7 @@ public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float Volume = volume; Source = source; Language = language; + StartTime = startTime; } public string Text { get; } @@ -129,6 +188,8 @@ public KokoroSourceQueueItem(string text, KokoroVoice voice, float speed, float public bool Aborted { get; private set; } public ClientLanguage Language { get; } + public long? StartTime { get; set; } // Use GetTimestamp() value + internal void Cancel() { Aborted = true; diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs index 8a80f2b..e6b0c88 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackend.cs @@ -1,4 +1,5 @@ using Dalamud.Bindings.ImGui; +using OpenAI; using Serilog; using System; using System.Net; @@ -13,13 +14,24 @@ public class OpenAiBackend : VoiceBackend { private readonly OpenAiBackendUI ui; private readonly OpenAiBackendUIModel uiModel; + private readonly OpenAiClient openAiClient; private readonly INotificationService notificationService; + private readonly LatencyTracker latencyTracker; + private readonly StreamingSoundQueue soundQueue; + private string apiKey; - public OpenAiBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService) + public OpenAiBackend(PluginConfiguration config, HttpClient http, INotificationService notificationService, LatencyTracker latencyTracker) { - this.uiModel = new OpenAiBackendUIModel(config, http); + var credentials = OpenAiCredentialManager.LoadCredentials(); + if (credentials != null) + { + apiKey = (credentials.Password); + } + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); + this.uiModel = new OpenAiBackendUIModel(config, http, latencyTracker); this.ui = new OpenAiBackendUI(uiModel, config, this); this.notificationService = notificationService; + this.openAiClient = new OpenAiClient(soundQueue, apiKey); } public override void DrawStyles(IConfigUIDelegates helpers) @@ -36,8 +48,7 @@ public override void Say(SayRequest request) { try { - Log.Information($"Voice name = {voicePreset.VoiceName}"); - await this.uiModel.OpenAi.Say(voicePreset, request, request.Text, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? string.Empty)); + await this.openAiClient.Say(request.Text, voicePreset.Model, request.Source, voicePreset.VoiceName, !string.IsNullOrWhiteSpace(request.Style) ? request.Style : (voicePreset.Style ?? 
string.Empty), 1.0f, voicePreset.Volume); } catch (OpenAiUnauthorizedException e) { @@ -69,12 +80,29 @@ public override void Say(SayRequest request) public override void CancelAllSpeech() { - this.uiModel.SoundQueue.CancelAllSounds(); + this.soundQueue.CancelAllSounds(); + + if (this.openAiClient._ttsCts != null) + { + this.openAiClient._ttsCts.Cancel(); + this.openAiClient._ttsCts.Dispose(); + this.openAiClient._ttsCts = null; + } + this.soundQueue.StopHardware(); + } public override void CancelSay(TextSource source) { - this.uiModel.SoundQueue.CancelFromSource(source); + this.soundQueue.CancelFromSource(source); + + if (this.openAiClient._ttsCts != null) + { + this.openAiClient._ttsCts.Cancel(); + this.openAiClient._ttsCts.Dispose(); + this.openAiClient._ttsCts = null; + } + this.soundQueue.StopHardware(); } public override void DrawSettings(IConfigUIDelegates helpers) @@ -91,6 +119,9 @@ public override TextSource GetCurrentlySpokenTextSource() protected override void Dispose(bool disposing) { - if (disposing) this.uiModel.SoundQueue.Dispose(); + if (disposing) + { + this.soundQueue.Dispose(); + } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs index dd2b413..6484186 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs @@ -2,6 +2,7 @@ using Dalamud.Game; using Dalamud.Game.Text; using System; +using System.Collections.Generic; using System.Linq; using TextToTalk.UI; using TextToTalk.UI.Windows; @@ -15,6 +16,7 @@ public class OpenAiBackendUI private readonly OpenAiBackend backend; private string apiKey; + private SortedSet selectedStyleIndices = new SortedSet(); public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, OpenAiBackend backend) { @@ -22,6 +24,7 @@ public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, O this.model = model; this.apiKey = this.model.GetApiKey(); this.backend = backend; + } public void DrawLoginOptions() @@ -45,6 +48,7 @@ public void DrawLoginOptions() } + public void DrawVoicePresetOptions() { var currentVoicePreset = model.GetCurrentVoicePreset(); @@ -66,6 +70,7 @@ public void DrawVoicePresetOptions() .ToArray(); if (ImGui.Combo($"Voice preset##{MemoizedId.Create()}", ref currentPresetIndex, presetDisplayNames, presets.Count)) config.SetCurrentVoicePreset(presets[currentPresetIndex].Id); + currentVoicePreset.SyncSetFromString(); } else if (currentVoicePreset != null) { @@ -117,33 +122,25 @@ public void DrawVoicePresetOptions() if (currentVoicePreset.Model == null) return; var currentModel = OpenAiClient.Models.First(x => x.ModelName == currentVoicePreset.Model); - // 1. Determine what to display in the preview (the value corresponding to the current key) if (!currentModel.Voices.TryGetValue(currentVoicePreset.VoiceName ?? "", out var currentPreviewName)) { - // Fallback if current key is invalid or null currentVoicePreset.VoiceName = currentModel.Voices.Keys.First(); currentPreviewName = currentModel.Voices[currentVoicePreset.VoiceName]; config.Save(); } - // 2. Start the Combo Box with the Descriptive Value as the preview if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", currentPreviewName)) { foreach (var voice in currentModel.Voices) { - // voice.Key is "alloy", "ash", etc.
diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs index dd2b413..6484186 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUI.cs @@ -2,6 +2,7 @@ using Dalamud.Game; using Dalamud.Game.Text; using System; +using System.Collections.Generic; using System.Linq; using TextToTalk.UI; using TextToTalk.UI.Windows; @@ -15,6 +16,7 @@ public class OpenAiBackendUI private readonly OpenAiBackend backend; private string apiKey; + private SortedSet<int> selectedStyleIndices = new SortedSet<int>(); public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, OpenAiBackend backend) { @@ -22,6 +24,7 @@ public OpenAiBackendUI(OpenAiBackendUIModel model, PluginConfiguration config, O this.model = model; this.apiKey = this.model.GetApiKey(); this.backend = backend; + } public void DrawLoginOptions() @@ -45,6 +48,7 @@ public void DrawLoginOptions() } + public void DrawVoicePresetOptions() { var currentVoicePreset = model.GetCurrentVoicePreset(); @@ -66,6 +70,7 @@ public void DrawVoicePresetOptions() .ToArray(); if (ImGui.Combo($"Voice preset##{MemoizedId.Create()}", ref currentPresetIndex, presetDisplayNames, presets.Count)) config.SetCurrentVoicePreset(presets[currentPresetIndex].Id); + currentVoicePreset.SyncSetFromString(); } else if (currentVoicePreset != null) { @@ -117,33 +122,25 @@ public void DrawVoicePresetOptions() if (currentVoicePreset.Model == null) return; var currentModel = OpenAiClient.Models.First(x => x.ModelName == currentVoicePreset.Model); - // 1. Determine what to display in the preview (the value corresponding to the current key) if (!currentModel.Voices.TryGetValue(currentVoicePreset.VoiceName ?? "", out var currentPreviewName)) { - // Fallback if current key is invalid or null currentVoicePreset.VoiceName = currentModel.Voices.Keys.First(); currentPreviewName = currentModel.Voices[currentVoicePreset.VoiceName]; config.Save(); } - // 2. Start the Combo Box with the Descriptive Value as the preview if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", currentPreviewName)) { foreach (var voice in currentModel.Voices) { - // voice.Key is "alloy", "ash", etc. - // voice.Value is "Alloy (Neutral & Balanced)", etc. bool isSelected = (currentVoicePreset.VoiceName == voice.Key); - // 3. Display the descriptive Value to the user if (ImGui.Selectable(voice.Value, isSelected)) { - // 4. Update config with the underlying Key currentVoicePreset.VoiceName = voice.Key; config.Save(); } - // Standard ImGui accessibility: set focus to the selected item if (isSelected) { ImGui.SetItemDefaultFocus(); @@ -184,17 +181,32 @@ public void DrawVoicePresetOptions() } else { - var style = currentVoicePreset.Style; - voiceStyles.Insert(0, ""); - var styleIndex = voiceStyles.IndexOf(currentVoicePreset.Style ?? ""); - if (ImGui.Combo($"Voice Style##{MemoizedId.Create()}", ref styleIndex, voiceStyles, voiceStyles.Count)) + string previewText = currentVoicePreset.Styles.Count > 0 + ? string.Join(", ", currentVoicePreset.Styles) + : "None selected"; + + if (ImGui.BeginCombo($"Voice Style##{MemoizedId.Create()}", previewText)) { - currentVoicePreset.Style = voiceStyles[styleIndex]; - this.config.Save(); + foreach (var styleName in config.CustomVoiceStyles) + { + bool isSelected = currentVoicePreset.Styles.Contains(styleName); + + if (ImGui.Selectable(styleName, isSelected, ImGuiSelectableFlags.DontClosePopups)) + { + if (isSelected) + currentVoicePreset.Styles.Remove(styleName); + else + currentVoicePreset.Styles.Add(styleName); + + currentVoicePreset.SyncStringFromSet(); + this.config.Save(); + } + } + ImGui.EndCombo(); } } - Components.HelpTooltip(""" + Components.HelpTooltip(""" Styles are additional information that can be provided to the model to help it generate more accurate speech. This can include things like emphasis, pronunciation, pauses, tone, pacing, voice affect, inflections, word choice etc. Examples can be found at https://openai.fm diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs index baf707a..3067bf8 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiBackendUIModel.cs @@ -2,6 +2,9 @@ using System.Collections.Generic; using System.Net.Http; using System.Text.RegularExpressions; +using System.ClientModel; +using OpenAI; +using OpenAI.Models; // Ensure you have the Models namespace namespace TextToTalk.Backends.OpenAI; @@ -16,7 +19,9 @@ public class OpenAiBackendUIModel /// /// Gets the sound playback queue. /// - public StreamSoundQueue SoundQueue { get; } + public StreamingSoundQueue SoundQueue { get; } + + //public RawStreamingSoundQueue RawStreamingSoundQueue { get; } /// /// Gets the currently-instantiated OpenAI client instance. /// @@ -32,22 +37,10 @@ public class OpenAiBackendUIModel /// Gets the valid voices for the current voice engine. /// NOTE: Currently there is no endpoint which provides this information for OpenAI.
/// - // public IReadOnlyDictionary> Voices { get; private set; } - public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http) + public OpenAiBackendUIModel(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { - SoundQueue = new StreamSoundQueue(config); - OpenAi = new OpenAiClient(SoundQueue, http); this.config = config; - this.apiKey = ""; - - // this.Voices = new Dictionary>(); - - var credentials = OpenAiCredentialManager.LoadCredentials(); - if (credentials != null) - { - LoginWith(credentials.Password); - } } /// @@ -88,27 +81,45 @@ public void SetCurrentVoicePreset(int id) this.config.Save(); } - private bool TryLogin(string testApiKey) +private bool TryLogin(string testApiKey) +{ + OpenAiLoginException = null; + var lastApiKey = this.apiKey; + + try { - OpenAiLoginException = null; - var lastApiKey = this.apiKey; - try - { - DetailedLog.Info("Testing OpenAI authorization status"); - OpenAi.ApiKey = testApiKey; - // This should throw an exception if the API key was incorrect - OpenAi.TestCredentials().GetAwaiter().GetResult(); - DetailedLog.Info("OpenAI authorization successful"); - return true; - } - catch (Exception e) - { - OpenAiLoginException = e; - OpenAi.ApiKey = lastApiKey; - DetailedLog.Error(e, "Failed to initialize OpenAI client"); - return false; - } + DetailedLog.Info("Testing OpenAI authorization status..."); + + // 1. Initialize a temporary client with the test key + // In the v2 SDK, you can use OpenAIModelClient for a cheap validation call + var modelClient = new OpenAIModelClient(new ApiKeyCredential(testApiKey)); + + // 2. Perform a 'List Models' call. + // This is a free metadata call that requires valid authentication. + // Use GetModels() to verify credentials. + _ = modelClient.GetModels(); + + // 3. 
If successful, update the primary ApiKey and return true + this.apiKey = testApiKey; + DetailedLog.Info("OpenAI authorization successful."); + return true; + } + catch (ClientResultException e) + { + // Specifically catch SDK-based authentication or client errors + OpenAiLoginException = e; + this.apiKey = lastApiKey; + DetailedLog.Error(e, $"OpenAI authorization failed: {e.Status} {e.Message}"); + return false; + } + catch (Exception e) + { + OpenAiLoginException = e; + this.apiKey = lastApiKey; + DetailedLog.Error(e, "An unexpected error occurred during OpenAI initialization."); + return false; } +} public void Dispose() { diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs index 6c94868..1d51076 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiClient.cs @@ -1,261 +1,120 @@ -using System; +using Dalamud.Bindings.ImGui; +using NAudio.CoreAudioApi; +using OpenAI; +using Serilog; +using System; +using System.ClientModel; +using System.ClientModel.Primitives; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; -using System.Net; using System.Net.Http; +using System.Net.Http.Headers; using System.Text; using System.Text.Json; +using System.Threading; using System.Threading.Tasks; -using System.Text.RegularExpressions; -using TextToTalk.GameEnums; -using Serilog; +using OpenAIAudio = OpenAI.Audio; namespace TextToTalk.Backends.OpenAI; -public class OpenAiClient(StreamSoundQueue soundQueue, HttpClient http) +public class OpenAiClient { - private const string UrlBase = "https://api.openai.com"; + private readonly OpenAIClient _openAiClient; + private readonly StreamingSoundQueue _soundQueue; + public CancellationTokenSource? 
_ttsCts; + + private readonly HttpClient _httpClient = new(); public record ModelConfig( - string ModelName, - IReadOnlyDictionary Voices, - bool InstructionsSupported, - bool SpeedSupported); + string ModelName, + IReadOnlyDictionary Voices, + bool InstructionsSupported, + bool SpeedSupported); private static readonly Dictionary VoiceLabels = new() -{ - { "alloy", "Alloy (Neutral & Balanced)" }, - { "ash", "Ash (Clear & Precise)" }, - { "ballad", "Ballad (Melodic & Smooth)" }, - { "coral", "Coral (Warm & Friendly)" }, - { "echo", "Echo (Resonant & Deep)" }, - { "fable", "Fable (Alto Narrative)" }, - { "onyx", "Onyx (Deep & Energetic)" }, - { "nova", "Nova (Bright & Energetic)" }, - { "sage", "Sage (Calm & Thoughtful)" }, - { "shimmer", "Shimmer (Bright & Feminine)" }, - { "verse", "Verse (Versatile & Expressive)" }, - { "marin", "Marin (Latest and Greatest)" }, - { "cedar", "Cedar (Latest and Greatest)" } -}; + { + { "alloy", "Alloy (Neutral & Balanced)" }, + { "ash", "Ash (Clear & Precise)" }, + { "ballad", "Ballad (Melodic & Smooth)" }, + { "coral", "Coral (Warm & Friendly)" }, + { "echo", "Echo (Resonant & Deep)" }, + { "fable", "Fable (Alto Narrative)" }, + { "onyx", "Onyx (Deep & Energetic)" }, + { "nova", "Nova (Bright & Energetic)" }, + { "sage", "Sage (Calm & Thoughtful)" }, + { "shimmer", "Shimmer (Bright & Feminine)" }, + { "verse", "Verse (Versatile & Expressive)" }, + { "marin", "Marin (Latest and Greatest)" }, + { "cedar", "Cedar (Latest and Greatest)" } + }; public static readonly List Models = [ - new("gpt-4o-mini-tts", - VoiceLabels.ToDictionary(v => v.Key, v => v.Value), - true, false), - - new("tts-1", - VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse") - .ToDictionary(v => v.Key, v => v.Value), - false, true), - - new("tts-1-hd", - VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse") - .ToDictionary(v => v.Key, v => v.Value), - false, false) + new("gpt-4o-mini-tts", VoiceLabels.ToDictionary(v => v.Key, v => v.Value), true, true), + new("tts-1", VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse").ToDictionary(v => v.Key, v => v.Value), false, true), + new("tts-1-hd", VoiceLabels.Where(v => v.Key != "ballad" && v.Key != "verse").ToDictionary(v => v.Key, v => v.Value), false, true) ]; - // public record ModelConfig(string ModelName, IReadOnlySet Voices, bool InstructionsSupported, bool SpeedSupported); - // - // public static readonly List Models = - // [ - // // Note: while speed is 'technically' supported by gpt-4o-mini-tts, it doesn't appear to influence the output. - // new("gpt-4o-mini-tts", new HashSet - // { - // "alloy", - // "ash", - // "ballad", - // "coral", - // "echo", - // "fable", - // "onyx", - // "nova", - // "sage", - // "shimmer", - // "verse" - // }, true, false), - // new("tts-1", new HashSet - // { - // "nova", - // "shimmer", - // "echo", - // "onyx", - // "fable", - // "alloy", - // "ash", - // "sage", - // "coral" - // }, false, true), - // new("tts-1-hd", new HashSet - // { - // "nova", - // "shimmer", - // "echo", - // "onyx", - // "fable", - // "alloy", - // "ash", - // "sage", - // "coral" - // }, false, false), - // ]; - public string? 
ApiKey { get; set; } - private void AddAuthorization(HttpRequestMessage req) - { - req.Headers.Add("Authorization", $"Bearer {ApiKey}"); - } - - private bool IsAuthorizationSet() + public OpenAiClient(StreamingSoundQueue soundQueue, string apiKey) { - return ApiKey is { Length: > 0 }; - } + _soundQueue = soundQueue; + ApiKey = apiKey; - public async Task TestCredentials() - { - if (!IsAuthorizationSet()) + if (!string.IsNullOrWhiteSpace(apiKey)) { - throw new OpenAiMissingCredentialsException("No OpenAI authorization keys have been configured."); + _openAiClient = new OpenAIClient(apiKey); } - - var uriBuilder = new UriBuilder(UrlBase) { Path = "/v1/models" }; - using var req = new HttpRequestMessage(HttpMethod.Get, uriBuilder.Uri); - AddAuthorization(req); - - var res = await http.SendAsync(req); - await EnsureSuccessStatusCode(res); } - public string? GetInstructionsForRequest(SayRequest request, OpenAiVoicePreset preset) + public async Task Say(string text, string modelName, TextSource source, string voiceId, string? instructions, float speed, float volume) { - var instructionBuilder = new StringBuilder(); - instructionBuilder.AppendLine($"Tone: Final fantasy 14 character named {request.Speaker}"); - if (request.Race is {Length: > 0}) - { - instructionBuilder.AppendLine($"Race: {request.Race}"); - } + long methodStart = Stopwatch.GetTimestamp(); + if (string.IsNullOrWhiteSpace(ApiKey)) return; - if (request.BodyType is not BodyType.Unknown) - { - instructionBuilder.AppendLine($"BodyType: {request.BodyType}"); - } + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; - if (preset.Style is {Length: > 0}) + try { - instructionBuilder.AppendLine($"Instructions: {(!string.IsNullOrEmpty(request.Style) ? request.Style : preset.Style)}"); // Style tags from Say Request take precedence over Style tags from voice preset. - } - - var instructions = instructionBuilder.ToString() - .Trim(); - - return instructions.Length > 0 ? instructions : null; - } - - public async Task Say(OpenAiVoicePreset preset, SayRequest request, string text, string style) - { - if (!IsAuthorizationSet()) + var requestBody = new Dictionary { - throw new OpenAiMissingCredentialsException("No OpenAI authorization keys have been configured."); - } - - var uriBuilder = new UriBuilder(UrlBase) { Path = "/v1/audio/speech" }; - using var req = new HttpRequestMessage(HttpMethod.Post, uriBuilder.Uri); - AddAuthorization(req); - - string model; - string voice; - if (preset.Model != null && Models.Any(m => m.ModelName == preset.Model)) - { - model = preset.Model; - } - else - { - model = Models.First().ModelName; - } - - if (request.Style is {Length: > 0 }) - { - model = "gpt-4o-mini-tts"; // Force Say request to model that can handle Voice Styles if user has embedded a style tag into their message - } - - var modelConfig = Models.First(m => m.ModelName == model); - if (preset.VoiceName != null && modelConfig.Voices.Keys.Contains(preset.VoiceName)) - { - voice = preset.VoiceName; - } - else - { - voice = modelConfig.Voices.Keys.First(); - } - - Dictionary args = new() - { - ["model"] = model, - ["input"] = text, - ["voice"] = voice, - ["response_format"] = "mp3", - ["speed"] = modelConfig.SpeedSupported ? preset.PlaybackRate ?? 1.0f : 1.0f + { "model", modelName }, + { "input", text }, + { "voice", voiceId.ToLowerInvariant() }, + { "response_format", "pcm" }, + { "speed", speed } }; - if (modelConfig.InstructionsSupported) - { - string? 
configinstructions = GetInstructionsForRequest(request, preset); - //if (style != "") - //{ - // args["instructions"] = style; - //} - // Instructions from style take precedence over preset instructions. - if (configinstructions != null) + // Check if model supports instructions (gpt-4o-mini-tts) + var modelCfg = Models.FirstOrDefault(m => m.ModelName == modelName); + if (modelCfg != null && modelCfg.InstructionsSupported && !string.IsNullOrEmpty(instructions)) { - args["instructions"] = configinstructions; + requestBody["instructions"] = instructions; } - } - var json = JsonSerializer.Serialize(args); - DetailedLog.Verbose(json); - using var content = new StringContent(json, Encoding.UTF8, "application/json"); - req.Content = content; - var res = await http.SendAsync(req); - await EnsureSuccessStatusCode(res); + using var request = new HttpRequestMessage(HttpMethod.Post, "https://api.openai.com/v1/audio/speech"); + request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", ApiKey); + request.Content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json"); - var mp3Stream = new MemoryStream(); - var responseStream = await res.Content.ReadAsStreamAsync(); - await responseStream.CopyToAsync(mp3Stream); - mp3Stream.Seek(0, SeekOrigin.Begin); + var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, token); + response.EnsureSuccessStatusCode(); - soundQueue.EnqueueSound(mp3Stream, request.Source, StreamFormat.Mp3, preset.Volume); - } + var responseStream = await response.Content.ReadAsStreamAsync(token); - private static async Task EnsureSuccessStatusCode(HttpResponseMessage res) - { - if (res.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden) + _soundQueue.EnqueueSound(responseStream, source, volume, StreamFormat.Wave, null, methodStart); + } + catch (OperationCanceledException) { - throw new OpenAiUnauthorizedException(res.StatusCode, "Unauthorized request."); + Log.Information("OpenAI Speech generation was cancelled."); } - - if (!res.IsSuccessStatusCode) + catch (Exception ex) { - try - { - var content = await res.Content.ReadAsStringAsync(); - DetailedLog.Debug(content); - - var error = JsonSerializer.Deserialize(content); - if (error?.Error != null) - { - throw new OpenAiFailedException(res.StatusCode, error.Error, - $"Request failed with status code {error.Error.Code}: {error.Error.Message}"); - } - } - catch (Exception e) when (e is not OpenAiFailedException) - { - DetailedLog.Error(e, "Failed to parse OpenAI error response."); - } - - throw new OpenAiFailedException(res.StatusCode, null, $"Request failed with status code {res.StatusCode}."); + Log.Error(ex, "OpenAI REST Speech generation failed."); } } -} \ No newline at end of file +} diff --git a/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs b/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs index 8fe4b86..14898a3 100644 --- a/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs +++ b/src/TextToTalk/Backends/OpenAI/OpenAiVoicePreset.cs @@ -1,4 +1,5 @@ -using System.Linq; +using System.Collections.Generic; +using System.Linq; using System.Text.Json.Serialization; namespace TextToTalk.Backends.OpenAI; @@ -13,8 +14,26 @@ public class OpenAiVoicePreset : VoicePreset public float? PlaybackRate { get; set; } [JsonPropertyName("OpenAIVoiceName")] public string? VoiceName { get; set; } - - public string? Style { get; set; } + + public string? 
Style { get; set; } = ""; + + [JsonIgnore] public SortedSet Styles { get; set; } = new SortedSet(); + + + public void SyncSetFromString() + { + Styles.Clear(); + if (string.IsNullOrWhiteSpace(Style)) return; + + foreach (var s in Style.Split(", ")) + Styles.Add(s); + } + + // Call this whenever the UI changes the Set to update the String + public void SyncStringFromSet() + { + Style = string.Join(", ", Styles); + } public override bool TrySetDefaultValues() { diff --git a/src/TextToTalk/Backends/Piper/PiperBackend.cs b/src/TextToTalk/Backends/Piper/PiperBackend.cs new file mode 100644 index 0000000..015d966 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperBackend.cs @@ -0,0 +1,390 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Game; +using Microsoft.ML.OnnxRuntime; +using PiperSharp; +using PiperSharp.Models; +using Serilog; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; +using TextToTalk.Backends.Kokoro; +using TextToTalk.Backends.Piper; + +namespace TextToTalk.Backends.Piper; + +public class PiperBackend : VoiceBackend +{ + private readonly PiperProvider piper; + private readonly PiperBackendUI ui; + private readonly StreamingSoundQueue soundQueue; + private readonly Task modelTask; + private CancellationTokenSource cts = new(); + private readonly PluginConfiguration config; + + private Process? piperServerProcess; + private readonly object processLock = new(); + private readonly LatencyTracker latencyTracker; + + public string GetVoicesDir(PluginConfiguration config) => + Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + + public PiperBackend(PluginConfiguration config, LatencyTracker latencyTracker) + { + this.ui = new PiperBackendUI(config, this); + + // 1. Point to the nested 'piper' subfolder created by ExtractPiper + string piperBaseDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper"); + string piperExe = Path.Combine(piperBaseDir, "piper.exe"); + + this.piper = new PiperProvider(new PiperConfiguration() + { + ExecutableLocation = piperExe, + WorkingDirectory = piperBaseDir + }); + + this.modelTask = LoadOrDownloadModelAsync(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); + this.config = config; + this.latencyTracker = latencyTracker; + } + + public async Task> GetAvailableModels() + { + return await PiperDownloader.GetHuggingFaceModelList(); + } + public static bool IsModelFileDownloaded(PluginConfiguration config) + { + var piperExePath = Path.Combine(config.GetPluginConfigDirectory(), "piper", "piper", "piper.exe"); + return File.Exists(piperExePath); + } + + /// + /// Downloads a specific model and initializes its folder structure. + /// + public async Task DownloadSpecificModel(string modelKey, VoiceModel entry) + { + string voicesDir = GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + try + { + DetailedLog.Info($"Downloading voice: {modelKey}"); + await entry.DownloadModel(voicesDir); + + string onnxPath = Path.Combine(modelTargetDir, $"{modelKey}.onnx"); + await LoadSpecificVoiceModel(onnxPath); + + DetailedLog.Info($"Successfully installed {modelKey}"); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to download {modelKey}: {ex.Message}"); + } + } + + public bool DeleteVoiceModel(string modelKey) + { + try + { + // 1. 
Kill any active speech to unlock files + KillActiveProcessInternal(); + + string voicesDir = GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + if (Directory.Exists(modelTargetDir)) + { + Directory.Delete(modelTargetDir, true); + DetailedLog.Info($"Deleted voice model: {modelKey}"); + return true; + } + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to delete {modelKey}: {ex.Message}"); + } + return false; + } + + /// Downloads the Piper executable and initial voice models. + public async Task EnsurePiperAssetsDownloaded(PluginConfiguration config) + { + string configDir = config.GetPluginConfigDirectory(); + string voicesDir = GetVoicesDir(config); + + await EnsureExecutableDownloaded(config); + + var allModels = await PiperDownloader.GetHuggingFaceModelList(); + + // TARGET: Only download "en_US-lessac-medium" initially + string starterModelKey = "en_US-lessac-medium"; + + if (allModels.TryGetValue(starterModelKey, out var modelEntry)) + { + string modelTargetDir = Path.Combine(voicesDir, starterModelKey); + + // Skip if already downloaded + if (!File.Exists(Path.Combine(modelTargetDir, $"{starterModelKey}.onnx"))) + { + try + { + DetailedLog.Info($"Downloading starter English voice: {starterModelKey}"); + + await modelEntry.DownloadModel(voicesDir); + + string onnxPath = Path.Combine(modelTargetDir, $"{starterModelKey}.onnx"); + + await LoadSpecificVoiceModel(onnxPath); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to download starter voice {starterModelKey}: {ex.Message}"); + } + } + else + { + DetailedLog.Info($"Starter voice {starterModelKey} already exists."); + } + } + else + { + DetailedLog.Error($"Starter voice {starterModelKey} not found in the Hugging Face model list."); + } + } + + private async Task EnsureExecutableDownloaded(PluginConfiguration config) + { + if (!IsModelFileDownloaded(config)) + { + string piperDir = Path.Combine(config.GetPluginConfigDirectory(), "piper"); + DetailedLog.Info("Piper executable missing. Downloading..."); + await PiperDownloader.DownloadPiper().ExtractPiper(piperDir); + } + } + + private async Task LoadOrDownloadModelAsync(PluginConfiguration config) + { + + string modelKey = "en_US-lessac-medium"; + string modelDir = Path.Combine(GetVoicesDir(config), modelKey); + string onnxFilePath = Path.Combine(modelDir, $"{modelKey}.onnx"); + + if (!File.Exists(onnxFilePath)) + { + await EnsurePiperAssetsDownloaded(config); + } + + return await LoadSpecificVoiceModel(onnxFilePath); + } + + private async Task LoadSpecificVoiceModel(string onnxFilePath) + { + string modelDir = Path.GetDirectoryName(onnxFilePath); + string configFilePath = onnxFilePath + ".json"; + string piperSharpExpectedJson = Path.Combine(modelDir, "model.json"); + + if (File.Exists(configFilePath) && !File.Exists(piperSharpExpectedJson)) + { + File.Copy(configFilePath, piperSharpExpectedJson, true); + } + + return await VoiceModel.LoadModel(modelDir); + } + + private bool TryGetModel([NotNullWhen(true)] out VoiceModel? 
tts) + { + if (modelTask.IsCompletedSuccessfully) + { + tts = modelTask.Result; + return true; + } + + tts = null; + return false; + } + + public override void Say(SayRequest request) + { + if (request.Voice is not PiperVoicePreset voicePreset) + throw new InvalidOperationException("Invalid voice preset."); + + if (!modelTask.IsCompletedSuccessfully) return; + + Task.Run(async () => await Say(request.Text, (PiperVoicePreset)request.Voice, request.Source)); + } + + public async Task Say(string text, PiperVoicePreset voicePreset, TextSource source) + { + long? timestampToPass = Stopwatch.GetTimestamp(); + + // 1. Validation + if (string.IsNullOrEmpty(voicePreset.ModelPath) || !File.Exists(voicePreset.ModelPath)) + { + DetailedLog.Error($"Piper model file not found: {voicePreset.ModelPath}"); + return; + } + + try + { + // 2. Prepare Model and Arguments + var voiceDir = Path.GetDirectoryName(voicePreset.ModelPath); + piper.Configuration.Model = await VoiceModel.LoadModel(voiceDir); + piper.Configuration.SpeakingRate = 1.0f / (voicePreset.Speed ?? 1f); + + string args = piper.Configuration.BuildArguments(); + + // 3. Initialize Process + var process = new Process(); + process.StartInfo = new ProcessStartInfo + { + FileName = piper.Configuration.ExecutableLocation, + Arguments = args, + RedirectStandardInput = true, + RedirectStandardOutput = true, + UseShellExecute = false, + CreateNoWindow = true, + + }; + + // 4. Thread-Safe Process Management + lock (processLock) + { + // Kill any dangling process before starting a new one + KillActiveProcessInternal(); + piperServerProcess = process; + } + + // 5. THE CANCELLATION BRIDGE + using var registration = cts.Token.Register(() => KillActiveProcessInternal()); + + process.Start(); + + // 6. Check for cancellation before writing to the pipe + if (cts.Token.IsCancellationRequested) + throw new OperationCanceledException(cts.Token); + + // 7. Write Text to StandardInput + using (var sw = new StreamWriter(process.StandardInput.BaseStream, leaveOpen: false)) + { + await sw.WriteLineAsync(text); + await sw.FlushAsync(); + } + + // 8. Determine Audio Format + var format = voicePreset.InternalName switch + { + string name when name.EndsWith("low") => StreamFormat.Wave16K, + string name when name.EndsWith("high") => StreamFormat.Wave, + _ => StreamFormat.Wave22K // Defaults to Medium/Standard + }; + + // 9. Enqueue Stream + soundQueue.EnqueueSound(process.StandardOutput.BaseStream, source, voicePreset.Volume ?? 1f, format, null, timestampToPass); + + // 10. 
Await process exit + await process.WaitForExitAsync(cts.Token); + } + catch (OperationCanceledException) + { + Log.Information("Piper synthesis task was cancelled."); + } + catch (Exception ex) + { + DetailedLog.Error($"Piper streaming failed: {ex.Message}"); + } + finally + { + KillActiveProcessInternal(); + } + } + + public void KillActiveProcessInternal() + { + lock (processLock) + { + if (piperServerProcess != null) + { + try + { + piperServerProcess.Kill(true); + } + catch (Exception ex) + { + DetailedLog.Debug($"Error killing piper process: {ex.Message}"); + } + finally + { + piperServerProcess.Dispose(); + piperServerProcess = null; + } + } + } + } + + public override void CancelAllSpeech() + { + KillActiveProcessInternal(); + + soundQueue.CancelAllSounds(); + soundQueue.StopHardware(); + cts.Cancel(); + cts.Dispose(); + cts = new CancellationTokenSource(); + } + + public override void CancelSay(TextSource source) + { + KillActiveProcessInternal(); + + soundQueue.CancelFromSource(source); + soundQueue.StopHardware(); + cts.Cancel(); + cts.Dispose(); + cts = new CancellationTokenSource(); + } + + public override void DrawSettings(IConfigUIDelegates helpers) + { + if (TryGetModel(out _)) + { + ui.DrawVoicePresetOptions(); + return; + } + + if (modelTask.Status == TaskStatus.Faulted) + { + ImGui.TextColored(ImColor.Red, $"Failed to download model: {modelTask.Exception?.Message}"); + DetailedLog.Error($"Failed to download Piper model: {modelTask.Exception}"); + } + else + { + ImGui.TextColored(ImColor.HintColor, "Model is still downloading or initializing..."); + } + } + + public override TextSource GetCurrentlySpokenTextSource() + { + return soundQueue.GetCurrentlySpokenTextSource(); + } + public override void DrawStyles(IConfigUIDelegates helpers) + { + helpers.OpenVoiceStylesConfig(); + } + protected override void Dispose(bool disposing) + { + cts.Cancel(); + + if (disposing) + { + soundQueue?.Dispose(); + cts.Dispose(); + } + } + +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperBackendUI.cs b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs new file mode 100644 index 0000000..85a6604 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperBackendUI.cs @@ -0,0 +1,425 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Utility; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text.Json; +using System.Threading.Tasks; +using TextToTalk.UI; +using PiperSharp; +using PiperSharp.Models; + +namespace TextToTalk.Backends.Piper; + +public class PiperBackendUI(PluginConfiguration config, PiperBackend piperBackend) +{ + private string[] availableModelPaths; + private string[] modelDisplayNames; + private int selectedModelIndex = -1; + + private bool showDownloader = false; + private IDictionary remoteModels; + private string searchQuery = ""; + private List cachedModels = new(); // For live list updates + private DateTime lastScan = DateTime.MinValue; + private bool isScanning = false; + + private HashSet activeDownloads = new HashSet(); + + public class PiperModelInfo + { + public string FullPath { get; set; } + public string DisplayName { get; set; } + public string Quality { get; set; } + public string LanguageName { get; set; } + + public static PiperModelInfo FromPath(string onnxPath) + { + var jsonPath = onnxPath + ".json"; + if (!File.Exists(jsonPath)) return null; + + try + { + var json = File.ReadAllText(jsonPath); + using var doc = JsonDocument.Parse(json); + var root = doc.RootElement; + + var langCode = 
root.GetProperty("language").GetProperty("code").GetString(); + var langPlain = root.GetProperty("language").GetProperty("name_english").GetString(); + + var prettyLang = GetPrettyLanguageName(langCode, langPlain); + var dataset = root.GetProperty("dataset").GetString(); + + return new PiperModelInfo + { + FullPath = onnxPath, + LanguageName = prettyLang, + DisplayName = dataset ?? "Unknown", + Quality = root.GetProperty("audio").GetProperty("quality").GetString()?.ToLower() ?? "medium" + }; + } + catch { return null; } + } + } + + private List sortedModels = new(); + private string[] sortedDisplayNames = Array.Empty(); + private string voicesFolderSize = "0 MB"; + + public static string GetPrettyLanguageName(string code, string fallbackName) + { + if (string.IsNullOrEmpty(code)) return fallbackName ?? "Unknown"; + + return code.ToLower().Replace("-", "_") switch + { + "en_gb" => "English - UK", + "en_us" => "English - US", + "es_ar" => "Spanish - AR", + "es_es" => "Spanish - ES", + "es_mx" => "Spanish - MX", + "nl_be" => "Dutch - BE", + "nl_nl" => "Dutch - NL", + _ => fallbackName + }; + } + + private void DrawLoadingSpinner(string label, float radius, float thickness, uint color) + { + // 1. Get current cursor position to draw + var pos = ImGui.GetCursorScreenPos(); + var size = new global::System.Numerics.Vector2(radius * 2, radius * 2); + + // 2. Reserve space in the ImGui layout so other elements don't overlap + ImGui.Dummy(size); + + // 3. Define the center of our circle + var center = new global::System.Numerics.Vector2(pos.X + radius, pos.Y + radius); + var drawList = ImGui.GetWindowDrawList(); + + // 4. Calculate animation timing + float time = (float)ImGui.GetTime(); + int numSegments = 30; + float startAngle = time * 8.0f; // Rotation speed + + // 5. Build the arc path (approx. 270 degrees) + drawList.PathClear(); + for (int i = 0; i <= numSegments; i++) + { + float a = startAngle + ((float)i / numSegments) * (MathF.PI * 1.5f); + drawList.PathLineTo(new global::System.Numerics.Vector2( + center.X + MathF.Cos(a) * radius, + center.Y + MathF.Sin(a) * radius)); + } + + // 6. Draw the stroke + drawList.PathStroke(color, ImDrawFlags.None, thickness); + } + + public void DrawVoicePresetOptions() + { + ImGui.TextColored(ImColor.HintColor, "Piper is a local neural TTS engine. Ensure you have downloaded models."); + + ImGui.Spacing(); + + var currentVoicePreset = config.GetCurrentVoicePreset(); + var presets = config.GetVoicePresetsForBackend(TTSBackend.Piper).ToList(); + + if (presets.Count > 0 && currentVoicePreset != null) + { + var presetIndex = currentVoicePreset is not null ? presets.IndexOf(currentVoicePreset) : -1; + if (ImGui.Combo($"Voice preset##{MemoizedId.Create()}", ref presetIndex, + presets.Select(p => p.Name).ToArray(), presets.Count)) + { + config.SetCurrentVoicePreset(presets[presetIndex].Id); + config.Save(); + selectedModelIndex = -1; + } + } + else if (currentVoicePreset != null) + { + ImGui.TextColored(ImColor.Red, "You have no presets. 
Create one to begin."); + } + else if (currentVoicePreset == null && presets.Count > 0) + { + config.SetCurrentVoicePreset(presets.First().Id); + } + + BackendUI.NewPresetButton($"New preset##{MemoizedId.Create()}", config); + + if (presets.Count == 0 || currentVoicePreset is null) return; + + ImGui.SameLine(); + BackendUI.DeletePresetButton($"Delete preset##{MemoizedId.Create()}", currentVoicePreset, TTSBackend.Piper, config); + + ImGui.Separator(); + + var presetName = currentVoicePreset.Name; + if (ImGui.InputText($"Preset name##{MemoizedId.Create()}", ref presetName, 64)) + { + currentVoicePreset.Name = presetName; + config.Save(); + } + + if (!isScanning && (DateTime.Now - lastScan).TotalSeconds > 3) + { + lastScan = DateTime.Now; + isScanning = true; + + Task.Run(() => + { + try + { + var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + if (Directory.Exists(voicesDir)) + { + var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); + cachedModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); + + var dirInfo = new DirectoryInfo(voicesDir); + long totalBytes = dirInfo.EnumerateFiles("*", SearchOption.AllDirectories).Sum(fi => fi.Length); + + voicesFolderSize = $"{(totalBytes / 1024f / 1024f):N0} MB"; // Display current size of Piper Voice Directory + } + } + finally { isScanning = false; } + }); + } + + // Model Selection + var voicesDir = Path.Combine(config.GetPluginConfigDirectory(), "piper", "voices"); + if (!Directory.Exists(voicesDir)) Directory.CreateDirectory(voicesDir); + + var files = Directory.GetFiles(voicesDir, "*.onnx", SearchOption.AllDirectories); + var allModels = files.Select(PiperModelInfo.FromPath).Where(m => m != null).ToList(); + + if (allModels.Count > 0) + { + var currentModel = allModels.FirstOrDefault(m => m.FullPath == currentVoicePreset.ModelPath); + + string previewValue = currentModel != null + ? $"{currentModel.LanguageName} : {currentModel.DisplayName} ({currentModel.Quality})" + : "Select a model..."; + + if (ImGui.BeginCombo($"##ModelSelect{MemoizedId.Create()}", previewValue)) + { + + var languageGroups = allModels + .GroupBy(m => m.LanguageName) + .OrderBy(g => g.Key); + + foreach (var group in languageGroups) + { + ImGui.Spacing(); + ImGui.TextDisabled($"--- {group.Key.ToUpper()} ---"); + ImGui.Separator(); + + foreach (var model in group.OrderBy(m => m.DisplayName)) + { + bool isSelected = currentVoicePreset.ModelPath == model.FullPath; + + string itemLabel = $"{model.LanguageName} : {model.DisplayName} ({model.Quality})"; + + if (ImGui.Selectable($"{itemLabel}##{model.FullPath}", isSelected)) + { + currentVoicePreset.ModelPath = model.FullPath; + currentVoicePreset.InternalName = Path.GetFileNameWithoutExtension(model.FullPath); + config.Save(); + } + + if (isSelected) ImGui.SetItemDefaultFocus(); + } + } + ImGui.EndCombo(); + } + ImGui.SameLine(); + ImGui.Text("Voice Model Selection"); + } + else + { + ImGui.TextColored(ImColor.Red, "No voice models found."); + } + + // --- Voice Parameters --- + var speed = currentVoicePreset.Speed ?? 1f; + if (ImGui.SliderFloat($"Speed##{MemoizedId.Create()}", ref speed, 0.5f, 3.0f, "%.2fx")) + { + currentVoicePreset.Speed = speed; + config.Save(); + } + + var volume = (int)((currentVoicePreset.Volume ?? 
1.0f) * 100); + if (ImGui.SliderInt($"Volume##{MemoizedId.Create()}", ref volume, 0, 200, "%d%%")) + { + currentVoicePreset.Volume = MathF.Round((float)volume / 100, 2); + config.Save(); + } + + if (ImGui.Button($"Test##{MemoizedId.Create()}")) + { + if (!string.IsNullOrEmpty(currentVoicePreset.ModelPath) && File.Exists(currentVoicePreset.ModelPath)) + { + piperBackend.CancelSay(TextSource.Chat); + piperBackend.Say($"Hello from Piper neural engine. This is a test message", currentVoicePreset, + TextSource.Chat); + } + } + ImGui.SameLine(); + if (ImGui.Button($"Open Voice Downloader##{MemoizedId.Create()}")) + { + showDownloader = true; + Task.Run(async () => + { + try + { + var models = await piperBackend.GetAvailableModels(); + remoteModels = models.ToDictionary(k => k.Key, v => (VoiceModel)v.Value); + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to fetch Piper manifest: {ex.Message}"); + } + }); + } + Components.Tooltip("Browse and download specific Piper voices from Hugging Face."); + ImGui.SameLine(); + ImGui.TextDisabled($"Local Storage Used: {voicesFolderSize}"); + if (ImGui.IsItemHovered()) + { + ImGui.SetTooltip("Total disk space used by downloaded Piper voice models."); + } + if (showDownloader) DrawVoiceDownloader(); + + ImGui.Separator(); + + ConfigComponents.ToggleUseGenderedVoicePresets($"Use gendered voices##{MemoizedId.Create()}", config); + if (config.UseGenderedVoicePresets) + { + BackendUI.GenderedPresetConfig("Piper", TTSBackend.Piper, config, presets); + } + } + private void DrawVoiceDownloader() + { + + ImGui.SetNextWindowSize(new global::System.Numerics.Vector2(500, 600), ImGuiCond.FirstUseEver); + if (ImGui.Begin("Piper Voice Downloader", ref showDownloader)) + { + if (remoteModels == null) + { + ImGui.Text("Fetching model list from Hugging Face..."); + ImGui.End(); + return; + } + + ImGui.InputTextWithHint("##Search", "Search voices (e.g. 'en_US' or 'medium')...", ref searchQuery, 64); + + ImGui.BeginChild("ModelList", new global::System.Numerics.Vector2(0, 0), true); + foreach (var model in remoteModels) + { + + var entry = model.Value; + var langCode = entry.Language?.Code ?? "unknown"; + + var langName = langCode.ToLower().Replace("-", "_") switch + { + "en_gb" => "English - UK", + "en_us" => "English - US", + "es_ar" => "Spanish - AR", + "es_es" => "Spanish - ES", + "es_mx" => "Spanish - MX", + "nl_be" => "Dutch - BE", + "nl_nl" => "Dutch - NL", + _ => entry.Language?.Name ?? "Unknown" // Fallback to original name + }; + var dataset = entry.Name ?? "Standard"; + var parts = (entry.Key ?? 
"unknown").Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries); + string quality = parts.Last().ToLower(); + + string formattedName = $"{langName} : {dataset} ({quality})"; + + if (!string.IsNullOrEmpty(searchQuery) && !formattedName.Contains(searchQuery, StringComparison.OrdinalIgnoreCase)) + continue; + + // Check if installed + bool isDownloaded = cachedModels.Any(m => m.FullPath.Contains(model.Key)); + bool isInstalled = cachedModels.Any(m => m.FullPath.Contains(Path.Combine("voices", model.Key))); + + ImGui.PushID(model.Key); + ImGui.TextUnformatted(formattedName); + ImGui.SameLine(ImGui.GetWindowWidth() - 120); + + if (isInstalled) + { + ImGui.TextColored(new global::System.Numerics.Vector4(0.5f, 1f, 0.5f, 1f), "Installed"); + + // Add a small delete button to the right of the "Installed" text + ImGui.SameLine(ImGui.GetWindowWidth() - 40); + ImGui.PushStyleColor(ImGuiCol.Button, new global::System.Numerics.Vector4(0.6f, 0.2f, 0.2f, 1f)); + if (ImGui.Button("X##Delete")) + { + if (piperBackend.DeleteVoiceModel(model.Key)) + { + lastScan = DateTime.MinValue; + } + } + ImGui.PopStyleColor(); + if (ImGui.IsItemHovered()) ImGui.SetTooltip("Delete this voice model from your computer."); + } + else + { + // CHECK: Is this specific model currently downloading? + if (activeDownloads.Contains(model.Key)) + { + // Place the spinner where the button would normally be + DrawLoadingSpinner($"##spinner_{model.Key}", 10.0f, 3.0f, ImGui.GetColorU32(ImGuiCol.ButtonHovered)); + ImGui.SameLine(); + ImGui.Text("Downloading..."); + } + else + { + if (ImGui.Button("Download")) + { + activeDownloads.Add(model.Key); + + _ = piperBackend.DownloadSpecificModel(model.Key, (VoiceModel)model.Value) + .ContinueWith(t => + { + lastScan = DateTime.MinValue; + activeDownloads.Remove(model.Key); + }); + } + } + } + ImGui.Separator(); + ImGui.PopID(); + } + ImGui.EndChild(); + ImGui.End(); + } + + } + public bool DeleteVoiceModel(string modelKey) + { + try + { + // 1. Kill any active speech to unlock files + piperBackend.KillActiveProcessInternal(); + + string voicesDir = piperBackend.GetVoicesDir(config); + string modelTargetDir = Path.Combine(voicesDir, modelKey); + + if (Directory.Exists(modelTargetDir)) + { + // Delete the folder and all contents (.onnx, .json) + Directory.Delete(modelTargetDir, true); + DetailedLog.Info($"Deleted voice model: {modelKey}"); + return true; + } + } + catch (Exception ex) + { + DetailedLog.Error($"Failed to delete {modelKey}: {ex.Message}"); + } + return false; + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs b/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs new file mode 100644 index 0000000..02a9ead --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperSoundQueueItem.cs @@ -0,0 +1,29 @@ +using Dalamud.Game; +using TextToTalk.Backends.Piper; + +namespace TextToTalk.Backends.Piper; + +public class PiperSoundQueueItem : SoundQueueItem +{ + public string Text { get; } + public PiperVoicePreset Voice { get; } + + public float Speed { get; } + public float Volume { get; } + public bool Aborted { get; private set; } + public ClientLanguage Language { get; } + + public long? StartTime { get; set; } + + public PiperSoundQueueItem(string text, PiperVoicePreset voice, TextSource source, ClientLanguage language, long? startTime) + { + Text = text; + Voice = voice; + Source = source; + Language = language; + StartTime = startTime; + + Speed = voice.Speed ?? 1.0f; + Volume = voice.Volume ?? 
1.0f; + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs b/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs new file mode 100644 index 0000000..dccacb8 --- /dev/null +++ b/src/TextToTalk/Backends/Piper/PiperVoicePreset.cs @@ -0,0 +1,26 @@ +using Newtonsoft.Json; + +namespace TextToTalk.Backends.Piper; + +public class PiperVoicePreset : VoicePreset +{ + [JsonProperty("ModelName")] + public string? InternalName { get; set; } + + [JsonProperty("ModelPath")] + public string? ModelPath { get; set; } + + public float? Speed { get; set; } + + public float? Volume { get; set; } + + public override bool TrySetDefaultValues() + { + InternalName = "en_US-lessac-medium"; + ModelPath = ""; // To be populated by the file picker or downloader + Speed = 1.0f; + Volume = 1.0f; + EnabledBackend = TTSBackend.Piper; + return true; + } +} \ No newline at end of file diff --git a/src/TextToTalk/Backends/Polly/PollyBackend.cs b/src/TextToTalk/Backends/Polly/PollyBackend.cs index 7042ccb..ea573ac 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackend.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackend.cs @@ -9,17 +9,18 @@ public class PollyBackend : VoiceBackend { private readonly PollyBackendUI ui; private readonly PollyBackendUIModel uiModel; + private readonly LatencyTracker latencyTracker; - public PollyBackend(PluginConfiguration config, HttpClient http) + public PollyBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFF0099FF); var lexiconManager = new DalamudLexiconManager(); - this.uiModel = new PollyBackendUIModel(config, lexiconManager); + this.uiModel = new PollyBackendUIModel(config, lexiconManager, latencyTracker); LexiconUtils.LoadFromConfigPolly(lexiconManager, config); - this.ui = new PollyBackendUI(this.uiModel, config, lexiconManager, http, this); + this.ui = new PollyBackendUI(this.uiModel, config, lexiconManager, http, this, latencyTracker); } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/Backends/Polly/PollyBackendUI.cs b/src/TextToTalk/Backends/Polly/PollyBackendUI.cs index 8fc8150..c091830 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackendUI.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackendUI.cs @@ -2,6 +2,7 @@ using Dalamud.Bindings.ImGui; using Dalamud.Game; using Dalamud.Game.Text; +using FFXIVClientStructs; using System; using System.IO; using System.Linq; @@ -19,12 +20,13 @@ public class PollyBackendUI private readonly LexiconComponent lexiconComponent; private readonly PollyBackendUIModel model; private readonly PollyBackend backend; + private readonly LatencyTracker latencyTracker; private string accessKey; private string secretKey; public PollyBackendUI(PollyBackendUIModel model, PluginConfiguration config, LexiconManager lexiconManager, - HttpClient http, PollyBackend backend) + HttpClient http, PollyBackend backend, LatencyTracker latencyTracker) { this.model = model; @@ -39,7 +41,9 @@ public PollyBackendUI(PollyBackendUIModel model, PluginConfiguration config, Lex new LexiconComponent(lexiconManager, lexiconRepository, config, () => config.PollyLexiconFiles); (this.accessKey, this.secretKey) = this.model.GetKeyPair(); + this.latencyTracker = latencyTracker; } + public void DrawSettings(IConfigUIDelegates helpers) { @@ -58,7 +62,7 @@ public void DrawSettings(IConfigUIDelegates helpers) if (ImGui.Button($"Save and Login##{MemoizedId.Create()}")) { - this.model.LoginWith(this.accessKey, this.secretKey); + 
this.model.LoginWith(this.accessKey, this.secretKey, this.latencyTracker); } ImGui.SameLine(); diff --git a/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs b/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs index 9bef8aa..5213523 100644 --- a/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs +++ b/src/TextToTalk/Backends/Polly/PollyBackendUIModel.cs @@ -46,7 +46,7 @@ public class PollyBackendUIModel : IDisposable /// public string[] Engines { get; } = { Engine.Neural, Engine.Standard, Engine.Generative, Engine.LongForm }; - public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager) + public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconManager, LatencyTracker latencyTracker) { this.config = config; this.lexiconManager = lexiconManager; @@ -59,7 +59,7 @@ public PollyBackendUIModel(PluginConfiguration config, LexiconManager lexiconMan this.keyPair.AccessKey = credentials.UserName; this.keyPair.SecretKey = credentials.Password; - TryPollyLogin(GetCurrentRegion()); + TryPollyLogin(GetCurrentRegion(), latencyTracker); } } @@ -75,13 +75,13 @@ public PollyKeyPair GetKeyPair() /// /// The client's access key. /// The client's secret access key. - public void LoginWith(string accessKey, string secretKey) + public void LoginWith(string accessKey, string secretKey, LatencyTracker latencyTracker) { var username = Whitespace.Replace(accessKey, ""); var password = Whitespace.Replace(secretKey, ""); this.keyPair = new PollyKeyPair { AccessKey = username, SecretKey = password }; - if (TryPollyLogin(GetCurrentRegion())) + if (TryPollyLogin(GetCurrentRegion(), latencyTracker)) { // Only save the user's new credentials if the login succeeded PollyCredentialManager.SaveCredentials(username, password); @@ -173,7 +173,7 @@ public void SetCurrentEngine(Engine engine) this.config.Save(); } - private bool TryPollyLogin(RegionEndpoint regionEndpoint) + private bool TryPollyLogin(RegionEndpoint regionEndpoint, LatencyTracker latencyTracker) { PollyLoginException = null; Polly?.Dispose(); @@ -181,7 +181,7 @@ private bool TryPollyLogin(RegionEndpoint regionEndpoint) { DetailedLog.Info($"Logging into AWS region {regionEndpoint}"); Polly = new PollyClient(this.keyPair.AccessKey, this.keyPair.SecretKey, regionEndpoint, - this.lexiconManager, this.config); + this.lexiconManager, this.config, latencyTracker); var currentVoicePreset = this.config.GetCurrentVoicePreset(); // This should throw an exception if the login credentials were incorrect this.voices = Polly.GetVoicesForEngine(currentVoicePreset?.VoiceEngine ?? 
Engine.Neural); diff --git a/src/TextToTalk/Backends/Polly/PollyClient.cs b/src/TextToTalk/Backends/Polly/PollyClient.cs index 716a3a1..421d98c 100644 --- a/src/TextToTalk/Backends/Polly/PollyClient.cs +++ b/src/TextToTalk/Backends/Polly/PollyClient.cs @@ -2,9 +2,12 @@ using Amazon.Polly; using Amazon.Polly.Model; using Amazon.Runtime; +using Serilog; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; +using System.Threading; using System.Threading.Tasks; using TextToTalk.Lexicons; @@ -13,15 +16,17 @@ namespace TextToTalk.Backends.Polly public class PollyClient : IDisposable { private readonly AmazonPollyClient client; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private readonly LexiconManager lexiconManager; private readonly PluginConfiguration config; - public PollyClient(string accessKey, string secretKey, RegionEndpoint region, LexiconManager lexiconManager, PluginConfiguration config) + public CancellationTokenSource? _TtsCts; + + public PollyClient(string accessKey, string secretKey, RegionEndpoint region, LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { var credentials = new BasicAWSCredentials(accessKey, secretKey); this.client = new AmazonPollyClient(credentials, region); - this.soundQueue = new StreamSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.lexiconManager = lexiconManager; } @@ -53,6 +58,13 @@ public TextSource GetCurrentlySpokenTextSource() public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, int sampleRate, int playbackRate, float volume, TextSource source, string text) { + long methodStart = Stopwatch.GetTimestamp(); + _TtsCts?.Cancel(); + _TtsCts?.Dispose(); + + _TtsCts = new CancellationTokenSource(); + var ct = _TtsCts.Token; + if (!string.IsNullOrEmpty(amazonDomainName)) { text = $"{text}"; @@ -70,34 +82,46 @@ public async Task Say(Engine engine, VoiceId voice, string? amazonDomainName, in SampleRate = sampleRate.ToString(), TextType = TextType.Ssml, }; - - SynthesizeSpeechResponse res; + bool isFirstChunk = true; try { - res = await this.client.SynthesizeSpeechAsync(req); + var res = await this.client.SynthesizeSpeechAsync(req, ct); + + long? timestampToPass = isFirstChunk ? 
methodStart : null; + this.soundQueue.EnqueueSound(res.AudioStream, source, volume, StreamFormat.Mp3, null, timestampToPass); + isFirstChunk = false; + } + catch (OperationCanceledException) + { + // Silently ignore cancellations } catch (Exception e) { DetailedLog.Error(e, "Synthesis request failed in {0}.", nameof(PollyClient)); - return; } - var responseStream = new MemoryStream(); - await res.AudioStream.CopyToAsync(responseStream); - responseStream.Seek(0, SeekOrigin.Begin); - - this.soundQueue.EnqueueSound(responseStream, source, StreamFormat.Mp3, volume); } public Task CancelAllSounds() { this.soundQueue.CancelAllSounds(); + if (this._TtsCts != null) + { + this._TtsCts.Cancel(); + } + this.soundQueue.StopHardware(); return Task.CompletedTask; } public Task CancelFromSource(TextSource source) { this.soundQueue.CancelFromSource(source); + this.soundQueue.CancelAllSounds(); + if (this._TtsCts != null) + { + this._TtsCts.Cancel(); + } + this.soundQueue.StopHardware(); return Task.CompletedTask; } diff --git a/src/TextToTalk/Backends/SoundQueue.cs b/src/TextToTalk/Backends/SoundQueue.cs index 387ca2e..3046f58 100644 --- a/src/TextToTalk/Backends/SoundQueue.cs +++ b/src/TextToTalk/Backends/SoundQueue.cs @@ -1,4 +1,5 @@ -using System; +using Serilog; +using System; using System.Collections.Generic; using System.Linq; using System.Threading; diff --git a/src/TextToTalk/Backends/StreamFormat.cs b/src/TextToTalk/Backends/StreamFormat.cs index 2284855..d7ae8fc 100644 --- a/src/TextToTalk/Backends/StreamFormat.cs +++ b/src/TextToTalk/Backends/StreamFormat.cs @@ -4,5 +4,9 @@ public enum StreamFormat { Mp3, Wave, + Wave8K, + Wave16K, + Wave22K, Raw, + System, } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamSoundQueue.cs b/src/TextToTalk/Backends/StreamSoundQueue.cs index e189499..3755c96 100644 --- a/src/TextToTalk/Backends/StreamSoundQueue.cs +++ b/src/TextToTalk/Backends/StreamSoundQueue.cs @@ -1,5 +1,6 @@ using NAudio.Wave; using NAudio.Wave.SampleProviders; +using Serilog; using System; using System.IO; using System.Threading; @@ -32,6 +33,7 @@ protected override void OnSoundLoop(StreamSoundQueueItem nextItem) // Play the sound lock (this.soundLock) { + Log.Information("Playing"); this.soundOut = new DirectSoundOut(playbackDeviceId); this.soundOut.PlaybackStopped += (_, _) => { this.speechCompleted.Set(); }; this.soundOut.Init(volumeSampleProvider); diff --git a/src/TextToTalk/Backends/StreamSoundQueueItem.cs b/src/TextToTalk/Backends/StreamSoundQueueItem.cs index 3ae5826..089c949 100644 --- a/src/TextToTalk/Backends/StreamSoundQueueItem.cs +++ b/src/TextToTalk/Backends/StreamSoundQueueItem.cs @@ -1,4 +1,7 @@ using System.IO; +using System.Net.Http; +using System.Diagnostics; +using static TextToTalk.Backends.System.SystemSoundQueue; namespace TextToTalk.Backends { @@ -10,14 +13,46 @@ public class StreamSoundQueueItem : SoundQueueItem public StreamFormat Format { get; init; } + protected override void Dispose(bool disposing) { if (disposing) { - Data.Dispose(); + try + { + Data.Dispose(); + } + catch { } + + base.Dispose(disposing); } + } + + public class StreamingSoundQueueItem : SoundQueueItem + { + public Stream Data { get; init; } + + public float Volume { get; init; } - base.Dispose(disposing); + public StreamFormat Format { get; init; } + public HttpResponseMessage? Response { get; set; } + public bool Aborted { get; set; } + + public long? 
StartTime { get; set; } // Use GetTimestamp() value + + protected override void Dispose(bool disposing) + { + if (disposing) + { + try + { + Data.Dispose(); + } + catch { } + + base.Dispose(disposing); + } + } } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/StreamingSoundQueue.cs b/src/TextToTalk/Backends/StreamingSoundQueue.cs new file mode 100644 index 0000000..a3ee4d8 --- /dev/null +++ b/src/TextToTalk/Backends/StreamingSoundQueue.cs @@ -0,0 +1,353 @@ +using Google.Protobuf.WellKnownTypes; +using NAudio.CoreAudioApi; +using NAudio.Wave; +using NAudio.Wave.SampleProviders; +using Serilog; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Net.Http; +using System.Net.Sockets; +using System.Speech.Synthesis; +using System.Threading; +using System.Threading.Tasks; +using TextToTalk.Backends.System; +using TextToTalk.Lexicons; +using static TextToTalk.Backends.StreamSoundQueueItem; + +namespace TextToTalk.Backends +{ + + public class StreamingSoundQueue(PluginConfiguration config, LatencyTracker latencyTracker) : SoundQueue + { + // WASAPI Hardware Members + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + private VolumeSampleProvider? volumeProvider; + private readonly object soundLock = new(); + + // 1. Unified Audio Configuration + private static readonly WaveFormat Wave8k = new(8000, 16, 1); + private static readonly WaveFormat Wave16k = new(16000, 16, 1); + private static readonly WaveFormat Wave22k = new(22050, 16, 1); + private static readonly WaveFormat Wave = new(24000, 16, 1); + + private bool _isDisposed; + public CancellationTokenSource? _ttsCts; + + private LatencyTracker latencyTracker = latencyTracker; + + public void EnqueueSound(Stream data, TextSource source, float volume, StreamFormat format, HttpResponseMessage? response, long? timeStamp) + { + AddQueueItem(new StreamingSoundQueueItem + { + Data = data, + Source = source, + Volume = volume, + Format = format, + Response = response, + StartTime = timeStamp, + }); + } + + protected override void OnSoundLoop(StreamingSoundQueueItem nextItem) + { + // 1. Handle Seekable vs Network Streams + if (nextItem.Data.CanSeek) + { + nextItem.Data.Position = 0; + } + + // 2. Branch logic based on format (Encoded vs Raw) + if (nextItem.Format == StreamFormat.Mp3) + { + ProcessMp3Stream(nextItem); + } + + else + { + ProcessRawPcmStream(nextItem); + } + } + + private void ProcessMp3Stream(StreamingSoundQueueItem nextItem) + { + IMp3FrameDecompressor decompressor = null; + try + { + // Wrap the network stream to support forward-only Position tracking + // and prevent partial-read exceptions in LoadFromStream. + using var readFullyStream = new ReadFullyStream(nextItem.Data); + while (true) + { + + Mp3Frame frame; + frame = Mp3Frame.LoadFromStream(readFullyStream); + + + if (frame == null) break; + + if (decompressor == null) + { + WaveFormat mp3Format = new Mp3WaveFormat(frame.SampleRate, + frame.ChannelMode == ChannelMode.Mono ? 
1 : 2, + frame.FrameLength, frame.BitRate); + + decompressor = new AcmMp3FrameDecompressor(mp3Format); + + lock (this.soundLock) + { + EnsureHardwareInitialized(decompressor.OutputFormat); + } + } + + byte[] decompressedBuffer = new byte[16384 * 2]; + int decompressedBytes = decompressor.DecompressFrame(frame, decompressedBuffer, 0); + ApplyVolumeToPcmBuffer(decompressedBuffer, decompressedBytes, nextItem.Volume); + + if (decompressedBytes > 0) + { + lock (this.soundLock) + { + if (this.bufferedProvider != null && this.soundOut != null) + { + this.bufferedProvider.AddSamples(decompressedBuffer, 0, decompressedBytes); + if (this.bufferedProvider.BufferedBytes > 4096 && + this.soundOut.PlaybackState != PlaybackState.Playing) + { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> PlayMp3): {Ms}", elapsed.TotalMilliseconds); + } + this.soundOut.Play(); + } + } + + + } + } + } + } + catch (Exception ex) + { + Log.Error(ex, "Error during real-time ACM decompression"); + } + finally + { + decompressor?.Dispose(); + nextItem.Data.Dispose(); + } + } + + private void ProcessRawPcmStream(StreamingSoundQueueItem nextItem) + { + WaveFormat chunkFormat = nextItem.Format switch + { + StreamFormat.Wave => Wave, + StreamFormat.Wave8K => Wave8k, + StreamFormat.Wave16K => Wave16k, + StreamFormat.Wave22K => Wave22k, + _ => throw new NotSupportedException($"Format {nextItem.Format} requires a decompressor."), + }; + + lock (this.soundLock) + { + EnsureHardwareInitialized(chunkFormat); + byte[] chunkBuffer = new byte[16384]; + int bytesRead; + bool latencyLogged = false; + + + + while ((bytesRead = nextItem.Data.Read(chunkBuffer, 0, chunkBuffer.Length)) > 0) + { + ApplyVolumeToPcmBuffer(chunkBuffer, bytesRead, nextItem.Volume); + this.bufferedProvider.AddSamples(chunkBuffer, 0, bytesRead); + + if (this.bufferedProvider.BufferedBytes > 16384 && this.soundOut.PlaybackState != PlaybackState.Playing) + { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> PlayPCM): {Ms}", elapsed.TotalMilliseconds); + } + + this.soundOut.Play(); + latencyLogged = true; + } + } + } + nextItem.Data.Dispose(); + } + + + private void EnsureHardwareInitialized(WaveFormat format) + { + if (this.soundOut == null || !this.bufferedProvider.WaveFormat.Equals(format)) + { + this.StopHardware(); + this.bufferedProvider = new BufferedWaveProvider(format) { ReadFully = true }; + this.bufferedProvider.BufferDuration = TimeSpan.FromSeconds(30); + + var mmDevice = GetWasapiDeviceFromGuid(config.SelectedAudioDeviceGuid); + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } + } + + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) + { + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) + { + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } + } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } + protected override 
void OnSoundCancelled() + { + StopHardware(); + } + public override void CancelAllSounds() + { + StopHardware(); + base.CancelAllSounds(); + } + + public void StopHardware() + { + lock (this.soundLock) + { + if (this.soundOut != null) + { + this.soundOut.Stop(); + Thread.Sleep(10); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) + { + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; + } + } + + } + protected override void Dispose(bool disposing) + { + if (_isDisposed) return; + _isDisposed = true; // Signal all loops to stop immediately + + if (disposing) + { + try + { + soundOut?.Stop(); + } + catch (Exception ex) + { + DetailedLog.Error(ex, "Error during early shutdown phase"); + } + + base.Dispose(disposing); // Clean up the queue thread + soundOut?.Dispose(); + } + } + private void ApplyVolumeToPcmBuffer(byte[] buffer, int bytesRead, float volume) + { + // Skip calculation if volume is 100% to save CPU + if (Math.Abs(volume - 1.0f) < 0.001f) return; + + for (int i = 0; i < bytesRead; i += 2) + { + // 1. Combine two bytes into one 16-bit signed integer (short) + short sample = (short)((buffer[i + 1] << 8) | buffer[i]); + + // 2. Scale the sample value by the volume float (0.0 to 1.0) + float scaledSample = sample * volume; + + // 3. Clamp the value to ensure it stays within 16-bit bounds to prevent "pops" + if (scaledSample > short.MaxValue) scaledSample = short.MaxValue; + if (scaledSample < short.MinValue) scaledSample = short.MinValue; + + short finalSample = (short)scaledSample; + + // 4. Split the 16-bit sample back into two bytes and store in the buffer + buffer[i] = (byte)(finalSample & 0xFF); + buffer[i + 1] = (byte)((finalSample >> 8) & 0xFF); + } + } + } + + public class ReadFullyStream : Stream + { + private readonly Stream sourceStream; + private long pos; + + public ReadFullyStream(Stream sourceStream) + { + this.sourceStream = sourceStream; + } + + public override int Read(byte[] buffer, int offset, int count) + { + int totalBytesRead = 0; + try + { + while (totalBytesRead < count) + { + int bytesRead = sourceStream.Read(buffer, offset + totalBytesRead, count - totalBytesRead); + if (bytesRead == 0) break; + totalBytesRead += bytesRead; + } + } + catch (Exception ex) when (ex.InnerException is SocketException se && se.NativeErrorCode == 10053) + { + // "An established connection was aborted by the software in your host machine" + // This happens when we cancel the TTS request. Return 0 to signal End of Stream. + return 0; + } + catch (IOException) + { + // General network interruption during skip + return 0; + } + + pos += totalBytesRead; + return totalBytesRead; + } + + // Required for the class to compile + public override void Write(byte[] buffer, int offset, int count) + => throw new NotSupportedException(); + + public override bool CanRead => true; + public override bool CanSeek => false; + public override bool CanWrite => false; + public override long Length => throw new NotSupportedException(); + + // We provide a getter for Position so Mp3Frame can track its progress, + // but the setter is not supported. 
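The ReadFullyStream wrapper above exists so that NAudio's MP3 frame parser can perform blocking, complete reads over a non-seekable network stream, and so a cancelled request surfaces as a clean end-of-stream instead of an exception. As an illustrative sketch only (http, audioUrl, and token are assumed to be in scope inside an async method; Mp3Frame.LoadFromStream is NAudio's frame-parsing entry point), consumption looks roughly like this:

    await using var network = await http.GetStreamAsync(audioUrl, token);
    using var mp3Stream = new ReadFullyStream(network);            // satisfies each read fully, or returns 0 on cancel/EOF
    NAudio.Wave.Mp3Frame frame;
    while ((frame = NAudio.Wave.Mp3Frame.LoadFromStream(mp3Stream)) != null)
    {
        // decompress the frame with AcmMp3FrameDecompressor, apply volume,
        // and feed the PCM into the BufferedWaveProvider as in the loop above
    }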
+ public override long Position + { + get => pos; + set => throw new NotSupportedException(); + } + + public override void Flush() { } + public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); + public override void SetLength(long value) => throw new NotSupportedException(); + } + +} diff --git a/src/TextToTalk/Backends/System/SystemBackend.cs b/src/TextToTalk/Backends/System/SystemBackend.cs index 4133358..4e7244c 100644 --- a/src/TextToTalk/Backends/System/SystemBackend.cs +++ b/src/TextToTalk/Backends/System/SystemBackend.cs @@ -1,4 +1,6 @@ -using System; +using Serilog; +using System; +using System.Diagnostics; using System.Net.Http; using System.Threading; @@ -10,9 +12,10 @@ public class SystemBackend : VoiceBackend private readonly SystemBackendUI ui; private readonly SystemSoundQueue soundQueue; private readonly IDisposable voiceExceptions; + private readonly LatencyTracker latencyTracker; - public SystemBackend(PluginConfiguration config, HttpClient http) + public SystemBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { var lexiconManager = new DalamudLexiconManager(); LexiconUtils.LoadFromConfigSystem(lexiconManager, config); @@ -20,8 +23,9 @@ public SystemBackend(PluginConfiguration config, HttpClient http) this.uiModel = new SystemBackendUIModel(); this.ui = new SystemBackendUI(this.uiModel, config, lexiconManager, http, this); - this.soundQueue = new SystemSoundQueue(lexiconManager, config); + this.soundQueue = new SystemSoundQueue(lexiconManager, config, latencyTracker); this.voiceExceptions = this.uiModel.SubscribeToVoiceExceptions(this.soundQueue.SelectVoiceFailed); + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) @@ -30,7 +34,9 @@ public override void DrawStyles(IConfigUIDelegates helpers) } public override void Say(SayRequest request) { - this.soundQueue.EnqueueSound(request.Voice, request.Source, request.Text); + long methodStart = Stopwatch.GetTimestamp(); + long? timestampToPass = methodStart; + this.soundQueue.EnqueueSound(request.Voice, request.Source, request.Text, timestampToPass); } public override void CancelAllSpeech() @@ -59,6 +65,7 @@ protected override void Dispose(bool disposing) { this.voiceExceptions.Dispose(); this.soundQueue.Dispose(); + } } } diff --git a/src/TextToTalk/Backends/System/SystemSoundQueue.cs b/src/TextToTalk/Backends/System/SystemSoundQueue.cs index f81bd83..092fea9 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueue.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueue.cs @@ -1,5 +1,11 @@ -using R3; +using NAudio.CoreAudioApi; +using NAudio.Wave; +using NAudio.Wave.SampleProviders; +using R3; +using Serilog; using System; +using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Speech.Synthesis; using System.Threading; @@ -10,125 +16,243 @@ namespace TextToTalk.Backends.System { public class SystemSoundQueue : SoundQueue { - private MemoryStream stream; - private readonly SpeechSynthesizer speechSynthesizer; - private readonly LexiconManager lexiconManager; - private readonly StreamSoundQueue streamSoundQueue; - private readonly SystemBackend backend; - private readonly PluginConfiguration config; - private int soundLock; - private readonly SemaphoreSlim deviceLock = new SemaphoreSlim(1, 1); + private WasapiOut? soundOut; + private BufferedWaveProvider? bufferedProvider; + private VolumeSampleProvider? 
volumeProvider; + private readonly object soundLock = new(); - public Observable SelectVoiceFailed => selectVoiceFailed; - private readonly Subject selectVoiceFailed; - private bool isSynthesizing = false; + private static readonly WaveFormat SystemFormat = new(22050, 16, 1); + private readonly SpeechSynthesizer _speechSynthesizer; + private readonly LexiconManager _lexiconManager; + private readonly PluginConfiguration _config; + private readonly LatencyTracker _latencyTracker; - public async void ASyncSpeak(SpeechSynthesizer synth, string textToSpeak) - { - await Task.Run(() => synth.SpeakSsml(textToSpeak)); - } + private readonly Subject _selectVoiceFailed = new(); + public Observable SelectVoiceFailed => _selectVoiceFailed; + private bool _isDisposed; - public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config) + private readonly Dictionary _synthPool = new(); + + private SpeechSynthesizer GetSynthesizerForVoice(string voiceName) { - this.streamSoundQueue = new StreamSoundQueue(config); - this.lexiconManager = lexiconManager; - this.speechSynthesizer = new SpeechSynthesizer(); - this.selectVoiceFailed = new Subject(); + if (!_synthPool.TryGetValue(voiceName, out var synth)) + { + synth = new SpeechSynthesizer(); + synth.SelectVoice(voiceName); + // Pre-link the bridge for this specific synth + _synthPool[voiceName] = synth; + } + return synth; } - - public void EnqueueSound(VoicePreset preset, TextSource source, string text) + public void EnqueueSound(VoicePreset preset, TextSource source, string text, long? timeStamp) { AddQueueItem(new SystemSoundQueueItem { Preset = preset, - Text = text, Source = source, + Text = text, + StartTime = timeStamp, }); } - protected override async void OnSoundLoop(SystemSoundQueueItem nextItem) + public SystemSoundQueue(LexiconManager lexiconManager, PluginConfiguration config, LatencyTracker latencyTracker) { - if (nextItem.Preset is not SystemVoicePreset systemVoicePreset) + _lexiconManager = lexiconManager; + _config = config; + _speechSynthesizer = new SpeechSynthesizer(); + _latencyTracker = latencyTracker; + } + + + + private MMDevice GetWasapiDeviceFromGuid(Guid targetGuid) + { + using var enumerator = new MMDeviceEnumerator(); + var devices = enumerator.EnumerateAudioEndPoints(DataFlow.Render, DeviceState.Active); + foreach (var device in devices) { - throw new InvalidOperationException("Invalid voice preset provided."); + if (device.Properties.Contains(PropertyKeys.PKEY_AudioEndpoint_GUID)) + { + var guidString = device.Properties[PropertyKeys.PKEY_AudioEndpoint_GUID].Value as string; + if (Guid.TryParse(guidString, out var deviceGuid) && deviceGuid == targetGuid) + return device; + } } + return enumerator.GetDefaultAudioEndpoint(DataFlow.Render, Role.Console); + } - try + protected override void OnSoundLoop(SystemSoundQueueItem nextItem) + { + if (nextItem.Preset is not SystemVoicePreset preset || nextItem.Aborted) return; + + lock (this.soundLock) { - this.speechSynthesizer.UseVoicePreset(nextItem.Preset); + if (this.soundOut == null) + { + var mmDevice = GetWasapiDeviceFromGuid(_config.SelectedAudioDeviceGuid); + + this.bufferedProvider = new BufferedWaveProvider(new WaveFormat(22050, 16, 1)) + { + ReadFully = true, + BufferDuration = TimeSpan.FromSeconds(30) + }; + this.soundOut = new WasapiOut(mmDevice, AudioClientShareMode.Shared, false, 50); + this.soundOut.Init(this.bufferedProvider); + } } - catch (SelectVoiceFailedException e) + + if (!_synthPool.TryGetValue(preset.VoiceName, out var synth)) { - 
DetailedLog.Error(e, "Failed to select voice {0}", systemVoicePreset.VoiceName ?? ""); - this.selectVoiceFailed.OnNext(e); + synth = new SpeechSynthesizer(); + synth.SelectVoice(preset.VoiceName); + _synthPool[preset.VoiceName] = synth; } - var ssml = this.lexiconManager.MakeSsml(nextItem.Text, - langCode: this.speechSynthesizer.Voice.Culture.IetfLanguageTag); - DetailedLog.Verbose(ssml); + synth.Volume = preset.Volume; + synth.Rate = preset.Rate; - try + if (_speechSynthesizer.Voice.Name != preset.VoiceName) { - isSynthesizing = true; + _speechSynthesizer.SelectVoice(preset.VoiceName); + } + _speechSynthesizer.Volume = preset.Volume; + _speechSynthesizer.Rate = preset.Rate; - await deviceLock.WaitAsync(); + var ssml = _lexiconManager.MakeSsml(nextItem.Text, langCode: _speechSynthesizer.Voice.Culture.IetfLanguageTag); - this.stream = new MemoryStream(); - this.speechSynthesizer.SetOutputToWaveStream(this.stream); + // Start Synthesis in Background (Feeding the buffer via bridge) + using var bridge = new SynthesisBridgeStream(this.bufferedProvider!); + _speechSynthesizer.SetOutputToWaveStream(bridge); - await Task.Run(() => this.speechSynthesizer.SpeakSsml(ssml)); + var synthPrompt = _speechSynthesizer.SpeakSsmlAsync(ssml); - } - catch (OperationCanceledException) + while (!nextItem.Aborted && (!synthPrompt.IsCompleted || this.bufferedProvider?.BufferedBytes > 44)) { - + if (this.bufferedProvider?.BufferedBytes > 512) // Pre-roll threshold + { + if (this.soundOut?.PlaybackState != PlaybackState.Playing) + { + if (nextItem.StartTime.HasValue) + { + var elapsed = Stopwatch.GetElapsedTime(nextItem.StartTime.Value); + _latencyTracker.AddLatency(elapsed.TotalMilliseconds); + Log.Debug("Total Latency (Say -> Play): {Ms}ms", elapsed.TotalMilliseconds); + } + this.soundOut?.Play(); + } + } } - finally + this.StopHardware(); + } + + // Custom Stream to pipe synthesizer output directly to NAudio buffer + private class SynthesisBridgeStream : Stream + { + private readonly BufferedWaveProvider _target; + private int _bytesToSkip = 44; + private bool _headerSkipped = false; + private long _position = 0; + + public SynthesisBridgeStream(BufferedWaveProvider target) => _target = target; + + public override void Write(byte[] buffer, int offset, int count) { - isSynthesizing = false; - deviceLock.Release(); + if (_bytesToSkip > 0) + { + int skipNow = Math.Min(count, _bytesToSkip); + _bytesToSkip -= skipNow; + offset += skipNow; + count -= skipNow; + } + + if (count > 0) + { + _target.AddSamples(buffer, offset, count); + } } - this.stream.Seek(0, SeekOrigin.Begin); - this.streamSoundQueue.EnqueueSound(stream, nextItem.Source, StreamFormat.Wave, 1f); - } + public override bool CanRead => false; + public override bool CanSeek => true; + public override bool CanWrite => true; + public override long Length => _position; + public override long Position { get => _position; set => _position = value; } - public override void CancelAllSounds() - { - base.CancelAllSounds(); - this.streamSoundQueue.CancelAllSounds(); + public override long Seek(long offset, SeekOrigin origin) => _position; // Dummy seek + public override void SetLength(long value) { } + public override void Flush() { } + public override int Read(byte[] buffer, int offset, int count) => 0; } - public override void CancelFromSource(TextSource source) + protected override void OnSoundCancelled() { - base.CancelFromSource(source); - this.streamSoundQueue.CancelFromSource(source); - } + GetCurrentItem()?.Cancel(); + _speechSynthesizer.SpeakAsyncCancelAll(); 
- protected override void OnSoundCancelled() + StopHardware(); + } + + public override void CancelAllSounds() { - try + if (_isDisposed) return; + + try { - this.speechSynthesizer.SetOutputToNull(); + _speechSynthesizer?.SpeakAsyncCancelAll(); } + catch (ObjectDisposedException) { } - catch (ObjectDisposedException) - { + StopHardware(); + base.CancelAllSounds(); + } + + private void StopHardware() + { + lock (this.soundLock) + { + if (this.soundOut != null) + { + this.soundOut.Stop(); + Thread.Sleep(10); + this.soundOut.Dispose(); + this.soundOut = null; + } + if (this.bufferedProvider != null) + { + this.bufferedProvider.ClearBuffer(); + this.bufferedProvider = null; + } } } protected override void Dispose(bool disposing) { + if (_isDisposed) return; + _isDisposed = true; + if (disposing) { - this.speechSynthesizer.Dispose(); - } + try + { + soundOut?.Stop(); + + _speechSynthesizer?.SpeakAsyncCancelAll(); + _speechSynthesizer?.SetOutputToNull(); - base.Dispose(disposing); + } + catch (Exception ex) + { + DetailedLog.Error(ex, "Error during early shutdown phase"); + } + + base.Dispose(disposing); + + _speechSynthesizer?.Dispose(); + soundOut?.Dispose(); + } } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs index ea52962..34bf0ed 100644 --- a/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs +++ b/src/TextToTalk/Backends/System/SystemSoundQueueItem.cs @@ -5,5 +5,14 @@ public class SystemSoundQueueItem : SoundQueueItem public VoicePreset Preset { get; set; } public string Text { get; set; } + + public bool Aborted { get; private set; } + + public long? StartTime { get; set; } // Use GetTimestamp() value + + internal void Cancel() + { + Aborted = true; + } } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/TTSBackend.cs b/src/TextToTalk/Backends/TTSBackend.cs index e7cab46..f963729 100644 --- a/src/TextToTalk/Backends/TTSBackend.cs +++ b/src/TextToTalk/Backends/TTSBackend.cs @@ -1,5 +1,6 @@ using System; using TextToTalk.Backends.Kokoro; +using TextToTalk.Backends.Piper; namespace TextToTalk.Backends { @@ -14,6 +15,7 @@ public enum TTSBackend : long OpenAi, GoogleCloud, Kokoro, + Piper, } public static class TTSBackendExtensions @@ -32,6 +34,8 @@ public static string GetFormattedName(this TTSBackend backend, PluginConfigurati TTSBackend.GoogleCloud => "Google Cloud", TTSBackend.Kokoro when config != null && KokoroBackend.IsModelFileDownloaded(config) => "Kokoro", TTSBackend.Kokoro => "Kokoro (169MB download required)", + TTSBackend.Piper when config != null && PiperBackend.IsModelFileDownloaded(config) => "Piper", + TTSBackend.Piper => "Piper (download required)", _ => throw new ArgumentOutOfRangeException(nameof(backend)), }; } @@ -49,6 +53,7 @@ public static bool AreLexiconsEnabled(this TTSBackend backend) TTSBackend.OpenAi => false, TTSBackend.GoogleCloud => false, TTSBackend.Kokoro => false, + TTSBackend.Piper => false, _ => throw new ArgumentOutOfRangeException(nameof(backend)), }; } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs index 6cdda83..7aa097d 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackend.cs @@ -11,19 +11,21 @@ namespace TextToTalk.Backends.Uberduck; /// public class UberduckBackend : VoiceBackend { - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; private 
readonly UberduckBackendUI ui; private readonly UberduckClient? uberduck; + private readonly LatencyTracker latencyTracker; - public UberduckBackend(PluginConfiguration config, HttpClient http) + public UberduckBackend(PluginConfiguration config, HttpClient http, LatencyTracker latencyTracker) { TitleBarColor = ImGui.ColorConvertU32ToFloat4(0xFFDE7312); - this.soundQueue = new StreamSoundQueue(config); + this.soundQueue = new StreamingSoundQueue(config, latencyTracker); this.uberduck = new UberduckClient(this.soundQueue, http); - var voices = this.uberduck.GetVoices().GetAwaiter().GetResult(); + var voices = this.uberduck.UpdateVoices().GetAwaiter().GetResult(); this.ui = new UberduckBackendUI(config, this.uberduck, () => voices, this); + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) @@ -67,11 +69,21 @@ await this.uberduck.Say(uberduckVoicePreset.VoiceName, uberduckVoicePreset.Playb public override void CancelAllSpeech() { + if (uberduck._ttsCts != null) + { + uberduck._ttsCts.Cancel(); + } + this.soundQueue.StopHardware(); this.soundQueue.CancelAllSounds(); } public override void CancelSay(TextSource source) { + if (uberduck._ttsCts != null) + { + uberduck._ttsCts.Cancel(); + } + this.soundQueue.StopHardware(); this.soundQueue.CancelFromSource(source); } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs index 0bb5ab3..d38f293 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckBackendUI.cs @@ -29,8 +29,7 @@ public UberduckBackendUI(PluginConfiguration config, UberduckClient uberduck, var credentials = UberduckCredentialManager.LoadCredentials(); if (credentials != null) { - this.apiKey = credentials.UserName; - this.apiSecret = credentials.Password; + this.apiKey = credentials.Password; } this.uberduck.ApiKey = this.apiKey; @@ -52,11 +51,9 @@ public void DrawSettings(IConfigUIDelegates helpers) if (ImGui.Button($"Save and Login##{MemoizedId.Create()}")) { - var username = Whitespace.Replace(this.apiKey, ""); - var password = Whitespace.Replace(this.apiSecret, ""); - UberduckCredentialManager.SaveCredentials(username, password); - this.uberduck.ApiKey = username; - this.uberduck.ApiSecret = password; + var apiKey = Whitespace.Replace(this.apiKey, ""); + UberduckCredentialManager.SaveCredentials(apiKey); + this.uberduck.ApiKey = apiKey; } ImGui.SameLine(); @@ -119,21 +116,32 @@ public void DrawSettings(IConfigUIDelegates helpers) var voiceCategoriesFlat = voiceCategories.SelectMany(vc => vc.Value).ToList(); var voiceDisplayNames = voiceCategoriesFlat.Select(v => v.DisplayName).ToArray(); var voiceIds = voiceCategoriesFlat.Select(v => v.Name).ToArray(); + // 1. Get the index var voiceIndex = Array.IndexOf(voiceIds, currentVoicePreset.VoiceName); - if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", voiceDisplayNames[voiceIndex])) + + // 2. Validate the index and determine the text to show in the combo box + // If -1, we show a placeholder or the raw name instead of crashing + string previewValue = (voiceIndex >= 0 && voiceIndex < voiceDisplayNames.Length) + ? 
voiceDisplayNames[voiceIndex] + : "Select a voice..."; // Fallback text + + if (ImGui.BeginCombo($"Voice##{MemoizedId.Create()}", previewValue)) { foreach (var (category, voices) in voiceCategories) { ImGui.Selectable(category, false, ImGuiSelectableFlags.Disabled); foreach (var voice in voices) { - if (ImGui.Selectable($" {voice.DisplayName}")) + // Highlight the currently selected item + bool isSelected = voice.Name == currentVoicePreset.VoiceName; + + if (ImGui.Selectable($" {voice.DisplayName}##{voice.Name}", isSelected)) { currentVoicePreset.VoiceName = voice.Name; this.config.Save(); } - if (voice.Name == currentVoicePreset.VoiceName) + if (isSelected) { ImGui.SetItemDefaultFocus(); } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs index 23d2fb0..297b740 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckClient.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckClient.cs @@ -1,109 +1,96 @@ -using Newtonsoft.Json; +using Dalamud.Game.ClientState.Fates; +using Dalamud.Interface.Windowing; +using Newtonsoft.Json; +using Serilog; using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.IO; using System.Linq; -using System.Net; using System.Net.Http; using System.Text; using System.Text.RegularExpressions; +using System.Threading; using System.Threading.Tasks; +using WindowsSystem = System.Net; namespace TextToTalk.Backends.Uberduck; public partial class UberduckClient { - private const string UrlBase = "https://api.uberduck.ai"; + private const string UrlBase = "https://api.uberduck.ai/v1"; private readonly HttpClient http; - private readonly StreamSoundQueue soundQueue; + private readonly StreamingSoundQueue soundQueue; + public CancellationTokenSource? _ttsCts; public string? ApiKey { private get; set; } public string? ApiSecret { private get; set; } + public IDictionary> CachedVoices { get; private set; } + = new Dictionary>(); + private IList Voices { get; set; } - public UberduckClient(StreamSoundQueue soundQueue, HttpClient http) + + public UberduckClient(StreamingSoundQueue soundQueue, HttpClient http) { this.http = http; this.soundQueue = soundQueue; + var credentials = UberduckCredentialManager.LoadCredentials(); + if (credentials != null) + { + this.ApiKey = credentials.Password; ; + } Voices = new List(); } - public async Task>> GetVoices() - { - if (Voices.Count == 0) await UpdateVoices(); - return Voices - .GroupBy(v => string.IsNullOrWhiteSpace(v.Category) ? "Uncategorized" : v.Category) - .ToImmutableSortedDictionary( - g => g.Key, - g => (IList)g.OrderByDescending(v => v.DisplayName).ToList()); - } - + // Uberduck TTS API call. They have moved to a versioned API so this section needed a pretty extensive re-work public async Task Say(string voice, int playbackRate, float volume, TextSource source, string text) { - if (!IsAuthorizationSet()) - { - throw new UberduckMissingCredentialsException("No Uberduck authorization keys have been configured."); - } - ArgumentException.ThrowIfNullOrEmpty(voice); + long methodStart = Stopwatch.GetTimestamp(); + _ttsCts?.Cancel(); + _ttsCts = new CancellationTokenSource(); + var token = _ttsCts.Token; + var url = "https://api.uberduck.ai/v1/text-to-speech"; - var voiceModelUuid = IsUuid(voice) ? 
voice : await GetUuidForVoice(voice); - var args = new UberduckSpeechRequest + var payload = new { - Speech = text, - VoiceModelUuid = voiceModelUuid, + text = text, + voice = voice, + output_format = "wav" }; - // Make the request - using var content = new StringContent(JsonConvert.SerializeObject(args), Encoding.UTF8, "application/json"); - var res = await SendRequest("/speak", reqContent: content); - var uuid = res?.Uuid; - if (uuid is null) - { - DetailedLog.Warn("Got null UUID from Uberduck"); - return; - } - - DetailedLog.Debug($"Got request UUID {uuid} from Uberduck"); + using var request = new HttpRequestMessage(HttpMethod.Post, url); + request.Headers.Authorization = new WindowsSystem.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey); + request.Content = new StringContent(JsonConvert.SerializeObject(payload), Encoding.UTF8, "application/json"); - // Poll for the TTS result - await Task.Delay(20); + var response = await this.http.SendAsync(request, token); - var path = ""; - do + if (response.IsSuccessStatusCode) { - try - { - var status = await GetSpeechStatus(uuid); - if (status is not { FailedAt: null }) - { - DetailedLog.Warn($"TTS request {uuid} failed for an unknown reason"); - return; - } - - path = status.Path; - } - catch (UberduckFailedException e) when (e.StatusCode is HttpStatusCode.NotFound) - { - // ignored - } + var json = await response.Content.ReadAsStringAsync(token); + var result = JsonConvert.DeserializeObject(json); - await Task.Delay(100); - } while (string.IsNullOrEmpty(path)); + if (result?.AudioUrl != null && !token.IsCancellationRequested) + { - DetailedLog.Debug($"Got response for TTS request {uuid}"); + var audioBytes = await this.http.GetByteArrayAsync(result.AudioUrl); - // Copy the sound to a new buffer and enqueue it - var responseStream = await this.http.GetStreamAsync(new Uri(path)); - var waveStream = new MemoryStream(); - await responseStream.CopyToAsync(waveStream); - waveStream.Seek(0, SeekOrigin.Begin); + var waveStream = new MemoryStream(audioBytes); + long? timestampToPass = methodStart; - this.soundQueue.EnqueueSound(waveStream, source, StreamFormat.Wave, volume); + this.soundQueue.EnqueueSound(waveStream, source, volume, StreamFormat.Mp3, null, timestampToPass); + } + } + } + public class UberduckTtsResponse + { + [JsonProperty("audio_url")] + public string? AudioUrl { get; set; } } private Task GetSpeechStatus(string uuid) @@ -122,14 +109,45 @@ private static bool IsUuid(string voice) private async Task GetUuidForVoice(string voice) { if (Voices.Count == 0) await UpdateVoices(); - var voiceInfo = Voices.Single(v => v.Name == voice); + var voiceInfo = Voices.Single(v => v.Name == voice); return voiceInfo.VoiceModelUuid; } - private async Task UpdateVoices() + public async Task>> UpdateVoices() { - var voicesRes = await this.http.GetStringAsync(new Uri("https://api.uberduck.ai/voices?mode=tts-basic")); - Voices = JsonConvert.DeserializeObject>(voicesRes) ?? new List(); + Log.Information("Updating Voices..."); + if (IsAuthorizationSet()) + { + var request = new HttpRequestMessage(HttpMethod.Get, "https://api.uberduck.ai/v1/voices"); + AddAuthorization(request); + + var response = await this.http.SendAsync(request); + + if (response.IsSuccessStatusCode) + { + var json = await response.Content.ReadAsStringAsync(); + var result = JsonConvert.DeserializeObject(json); + + this.Voices = result?.Voices ?? new List(); + + this.CachedVoices = this.Voices + .OrderBy(v => v.DisplayName) + .GroupBy(v => v.Category ?? 
"Uncategorized") + .ToDictionary( + g => g.Key, + g => (IList)g.ToList() + ); + return this.CachedVoices; + } + else + { + Log.Information($"Response = {response.StatusCode}"); + } + } + else { Log.Information("Authorization not set, cannot update voices."); + } + + return new Dictionary>(); } private async Task SendRequest(string endpoint, string query = "", @@ -151,7 +169,7 @@ private async Task UpdateVoices() var res = await this.http.SendAsync(req); var resContent = await res.Content.ReadAsStringAsync(); - if (res.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden) + if (res.StatusCode is WindowsSystem.HttpStatusCode.Unauthorized or WindowsSystem.HttpStatusCode.Forbidden) { var detail = GetRequestFailureDetail(resContent); throw new UberduckUnauthorizedException(detail); @@ -183,15 +201,14 @@ private static string GetRequestFailureDetail(string resContent) private void AddAuthorization(HttpRequestMessage req) { - // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/WWW-Authenticate#basic_authentication - var raw = Encoding.UTF8.GetBytes($"{ApiKey}:{ApiSecret}"); - var encodedAuth = Convert.ToBase64String(raw); - req.Headers.Add("Authorization", $"Basic {encodedAuth}"); + req.Headers.Authorization = new global::System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", ApiKey); } private bool IsAuthorizationSet() { - return ApiKey?.Length > 0 && ApiSecret?.Length > 0; + var resultbool = ApiKey?.Length > 0; + Log.Information($"Is Authorization Set? {resultbool}"); + return resultbool;// && ApiSecret?.Length > 0; } private class UberduckSpeechRequest diff --git a/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs b/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs index 23fb8e9..a274ecb 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckCredentialManager.cs @@ -13,9 +13,9 @@ public static class UberduckCredentialManager return credentials; } - public static void SaveCredentials(string username, string password) + public static void SaveCredentials(string apikey)//, string password) { - var credentials = new NetworkCredential(username, password); + var credentials = new NetworkCredential("null", apikey);//, password); CredentialManager.SaveCredentials(CredentialsTarget, credentials); } diff --git a/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs b/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs index 7ca7824..b196b7a 100644 --- a/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs +++ b/src/TextToTalk/Backends/Uberduck/UberduckVoice.cs @@ -1,14 +1,33 @@ using Newtonsoft.Json; +using System.Collections.Generic; namespace TextToTalk.Backends.Uberduck; public class UberduckVoice { - [JsonProperty("display_name")] public required string DisplayName { get; init; } + [JsonProperty("display_name")] + public required string DisplayName { get; init; } - [JsonProperty("name")] public required string Name { get; init; } + [JsonProperty("name")] + public required string Name { get; init; } - [JsonProperty("voicemodel_uuid")] public required string VoiceModelUuid { get; init; } + [JsonProperty("voicemodel_uuid")] + public required string VoiceModelUuid { get; init; } - [JsonProperty("category")] public required string? Category { get; init; } + [JsonProperty("category")] + public string? Category { get; init; } + + // New for 2026: Useful for filtering by locale + [JsonProperty("language")] + public string? 
Language { get; init; } + + // New for 2026: Contains traits like "professional", "narrative", etc. + [JsonProperty("tags")] + public List Tags { get; init; } = new(); +} + +public class UberduckVoiceResponse +{ + [JsonProperty("voices")] + public IList Voices { get; set; } } \ No newline at end of file diff --git a/src/TextToTalk/Backends/VoiceBackendManager.cs b/src/TextToTalk/Backends/VoiceBackendManager.cs index e539250..e115e5c 100644 --- a/src/TextToTalk/Backends/VoiceBackendManager.cs +++ b/src/TextToTalk/Backends/VoiceBackendManager.cs @@ -9,6 +9,7 @@ using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -23,17 +24,20 @@ public class VoiceBackendManager : VoiceBackend private readonly PluginConfiguration config; private readonly IUiBuilder uiBuilder; private readonly INotificationService notificationService; + private readonly LatencyTracker latencyTracker; public VoiceBackend? Backend { get; private set; } public bool BackendLoading { get; private set; } public VoiceBackendManager(PluginConfiguration config, HttpClient http, IUiBuilder uiBuilder, - INotificationService notificationService) + INotificationService notificationService, LatencyTracker tracker) { this.config = config; this.http = http; this.uiBuilder = uiBuilder; this.notificationService = notificationService; + this.latencyTracker = tracker; + SetBackend(this.config.Backend); } @@ -104,15 +108,16 @@ private VoiceBackend CreateBackendFor(TTSBackend backendKind) { return backendKind switch { - TTSBackend.System => new SystemBackend(this.config, this.http), - TTSBackend.Websocket => new WebsocketBackend(this.config, this.notificationService), - TTSBackend.AmazonPolly => new PollyBackend(this.config, this.http), - TTSBackend.Uberduck => new UberduckBackend(this.config, this.http), - TTSBackend.Azure => new AzureBackend(this.config, this.http), - TTSBackend.ElevenLabs => new ElevenLabsBackend(this.config, this.http, this.notificationService), - TTSBackend.OpenAi => new OpenAiBackend(this.config, this.http, this.notificationService), - TTSBackend.GoogleCloud => new GoogleCloudBackend(this.config), - TTSBackend.Kokoro => new KokoroBackend(this.config), + TTSBackend.System => new SystemBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Websocket => new WebsocketBackend(this.config, this.notificationService, this.latencyTracker), + TTSBackend.AmazonPolly => new PollyBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Uberduck => new UberduckBackend(this.config, this.http, this.latencyTracker), + TTSBackend.Azure => new AzureBackend(this.config, this.http, this.latencyTracker), + TTSBackend.ElevenLabs => new ElevenLabsBackend(this.config, this.http, this.notificationService, this.latencyTracker), + TTSBackend.OpenAi => new OpenAiBackend(this.config, this.http, this.notificationService, this.latencyTracker), + TTSBackend.GoogleCloud => new GoogleCloudBackend(this.config, this.latencyTracker), + TTSBackend.Kokoro => new KokoroBackend(this.config, this.latencyTracker), + TTSBackend.Piper => new PiperBackend(this.config, this.latencyTracker), _ => throw new ArgumentOutOfRangeException(nameof(backendKind)), }; } diff --git a/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs b/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs index 058e01a..e9be0d6 100644 --- 
a/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs +++ b/src/TextToTalk/Backends/Websocket/WebsocketBackend.cs @@ -23,7 +23,9 @@ public class WebsocketBackend : VoiceBackend private bool dirtyConfig; private Exception? lastException; - public WebsocketBackend(PluginConfiguration config, INotificationService notificationService) + private readonly LatencyTracker latencyTracker; + + public WebsocketBackend(PluginConfiguration config, INotificationService notificationService, LatencyTracker latencyTracker) { this.config = config; @@ -40,6 +42,7 @@ public WebsocketBackend(PluginConfiguration config, INotificationService notific } this.wsServer.Start(); + this.latencyTracker = latencyTracker; } public override void DrawStyles(IConfigUIDelegates helpers) diff --git a/src/TextToTalk/CommandModules/MainCommandModule.cs b/src/TextToTalk/CommandModules/MainCommandModule.cs index dd516c9..91264bd 100644 --- a/src/TextToTalk/CommandModules/MainCommandModule.cs +++ b/src/TextToTalk/CommandModules/MainCommandModule.cs @@ -14,9 +14,10 @@ public class MainCommandModule : CommandModule private readonly ConfigurationWindow configurationWindow; private readonly IConfigUIDelegates configUIDelegates; private readonly VoiceStyles StylesWindow; + private readonly StatsWindow StatsWindow; public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginConfiguration config, - VoiceBackendManager backendManager, ConfigurationWindow configurationWindow, IConfigUIDelegates configUIDelegates, VoiceStyles StylesWindow) : base(commandManager) //ElevenLabsStylesWindow elevenLabsStylesWindow) + VoiceBackendManager backendManager, ConfigurationWindow configurationWindow, IConfigUIDelegates configUIDelegates, VoiceStyles StylesWindow, StatsWindow statsWindow) : base(commandManager) //ElevenLabsStylesWindow elevenLabsStylesWindow) { this.chat = chat; @@ -25,6 +26,7 @@ public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginCo this.configurationWindow = configurationWindow; this.configUIDelegates = configUIDelegates; this.StylesWindow = StylesWindow; + this.StatsWindow = statsWindow; AddCommand("/canceltts", CancelTts, "Cancel all queued TTS messages."); AddCommand("/toggletts", ToggleTts, "Toggle TextToTalk's text-to-speech."); @@ -32,6 +34,8 @@ public MainCommandModule(ICommandManager commandManager, IChatGui chat, PluginCo AddCommand("/enabletts", EnableTts, "Enable TextToTalk's text-to-speech."); AddCommand("/tttconfig", ToggleConfig, "Toggle TextToTalk's configuration window."); AddCommand("/tttstyles", ToggleStyles, "Toggle TextToTalk's styles window."); + AddCommand("/tttstats", ToggleStats, "Toggle TextToTalk's latency stats window."); + } public void CancelTts(string command = "", string args = "") @@ -70,4 +74,9 @@ public void ToggleStyles(string command = "", string args = "") { this.StylesWindow.Toggle(); } + + public void ToggleStats(string command = "", string args = "") + { + this.StatsWindow.Toggle(); + } } \ No newline at end of file diff --git a/src/TextToTalk/GameEnums/AdditionalChatType.cs b/src/TextToTalk/GameEnums/AdditionalChatType.cs index 76c992d..12b806d 100644 --- a/src/TextToTalk/GameEnums/AdditionalChatType.cs +++ b/src/TextToTalk/GameEnums/AdditionalChatType.cs @@ -10,7 +10,7 @@ public enum AdditionalChatType Gathering = 67, FCAnnouncement = 69, FCLogin = 70, - RetainerSale = 71, + //RetainerSale = 71, PartyFinderState = 72, ActionUsedOnYou = 2091, FailedActionUsedOnYou = 2218, diff --git a/src/TextToTalk/PluginConfiguration.cs 
b/src/TextToTalk/PluginConfiguration.cs index 7f1088e..0aa55a1 100644 --- a/src/TextToTalk/PluginConfiguration.cs +++ b/src/TextToTalk/PluginConfiguration.cs @@ -8,6 +8,7 @@ using System.IO; using System.Linq; using System.Net; +using System.Threading; using TextToTalk.Backends; using TextToTalk.Backends.System; using TextToTalk.Backends.Websocket; diff --git a/src/TextToTalk/TextProviders/ChatMessageHandler.cs b/src/TextToTalk/TextProviders/ChatMessageHandler.cs index f330161..1f05de0 100644 --- a/src/TextToTalk/TextProviders/ChatMessageHandler.cs +++ b/src/TextToTalk/TextProviders/ChatMessageHandler.cs @@ -127,12 +127,14 @@ private void ProcessChatMessage(ChatMessage chatMessage) if (!this.filters.OnlyMessagesFromYou(speaker?.Name.TextValue ?? sender.TextValue)) return; if (!this.filters.ShouldSayFromYou(speaker?.Name.TextValue ?? sender.TextValue)) return; - - OnTextEmit.Invoke(new ChatTextEmitEvent( - GetCleanSpeakerName(speaker, sender), - textValue, - speaker, - type)); + + else if (type == XivChatType.TellOutgoing && config.SkipMessagesFromYou == true) return; + + OnTextEmit.Invoke(new ChatTextEmitEvent( + GetCleanSpeakerName(speaker, sender), + textValue, + speaker, + type)); } private static SeString GetCleanSpeakerName(IGameObject? speaker, SeString sender) diff --git a/src/TextToTalk/TextToTalk.cs b/src/TextToTalk/TextToTalk.cs index 6aca486..657429d 100644 --- a/src/TextToTalk/TextToTalk.cs +++ b/src/TextToTalk/TextToTalk.cs @@ -23,6 +23,7 @@ using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -43,6 +44,8 @@ using static System.Net.Mime.MediaTypeNames; using GameObject = Dalamud.Game.ClientState.Objects.Types.IGameObject; +using Serilog; + namespace TextToTalk { public partial class TextToTalk : IDalamudPlugin @@ -89,6 +92,9 @@ public partial class TextToTalk : IDalamudPlugin private readonly IConfigUIDelegates configUIDelegates; private readonly VoiceStyles StylesWindow; + private readonly LatencyTracker tracker; + private readonly StatsWindow statsWindow; + public string Name => "TextToTalk"; @@ -117,6 +123,10 @@ public TextToTalk( this.framework = framework; this.data = data; + this.tracker = new LatencyTracker(); + this.statsWindow = new StatsWindow(this.tracker); + + CreateDatabasePath(); CreateEventLogDatabase(); this.database = new LiteDatabase(GetDatabasePath("TextToTalk.db")); @@ -138,7 +148,7 @@ public TextToTalk( var sharedState = new SharedState(); this.http = new HttpClient(); - this.backendManager = new VoiceBackendManager(this.config, this.http, pi.UiBuilder, this.notificationService); + this.backendManager = new VoiceBackendManager(this.config, this.http, pi.UiBuilder, this.notificationService, this.tracker); this.StylesWindow = new VoiceStyles(this.backendManager, this.configUIDelegates, this.config); this.playerService = new PlayerService(playerCollection, this.config.GetVoiceConfig().VoicePresets); this.npcService = new NpcService(npcCollection, this.config.GetVoiceConfig().VoicePresets); @@ -166,6 +176,7 @@ public TextToTalk( this.windows.AddWindow(this.configurationWindow); this.windows.AddWindow(channelPresetModificationWindow); this.windows.AddWindow(this.StylesWindow); + this.windows.AddWindow(this.statsWindow); var filters = new MessageHandlerFilters(sharedState, this.config, this.clientState); this.addonTalkHandler = @@ -184,10 +195,10 @@ public TextToTalk( 
this.commandModule = new MainCommandModule(commandManager, chat, this.config, this.backendManager, - this.configurationWindow, this.configUIDelegates, this.StylesWindow); + this.configurationWindow, this.configUIDelegates, this.StylesWindow, this.statsWindow); this.debugCommandModule = new DebugCommandModule(commandManager, chat, gui, framework); - + RegisterCallbacks(); var handleTextCancel = HandleTextCancel(); @@ -507,6 +518,7 @@ private static unsafe bool TryGetCharacter(GameObject? speaker, OpenAiBackend => GetVoiceForSpeaker(name, gender), GoogleCloudBackend => GetVoiceForSpeaker(name, gender), KokoroBackend => GetVoiceForSpeaker(name, gender), + PiperBackend => GetVoiceForSpeaker(name, gender), _ => throw new InvalidOperationException("Failed to get voice preset for backend."), }; } @@ -600,4 +612,45 @@ public void Dispose() #endregion } + public class LatencyTracker + { + private readonly List history = new(); + private readonly object historyLock = new(); + + + public double AverageLatency + { + get + { + lock (historyLock) + { + return history.Count == 0 ? 0 : history.Average(); + } + } + } + public float[] GetHistoryArray() + { + lock (historyLock) + { + return history.Select(d => (float)d).ToArray(); + } + } + + public void AddLatency(double ms) + { + lock (historyLock) + { + history.Add(ms); + if (history.Count > 100) history.RemoveAt(0); // Keep last 100 requests + } + } + + public void Clear() + { + lock (historyLock) + { + history.Clear(); + } + } + } } \ No newline at end of file diff --git a/src/TextToTalk/TextToTalk.csproj b/src/TextToTalk/TextToTalk.csproj index 6258ee0..c32a566 100644 --- a/src/TextToTalk/TextToTalk.csproj +++ b/src/TextToTalk/TextToTalk.csproj @@ -22,6 +22,11 @@ true + + + + + Always @@ -31,11 +36,13 @@ - + + + compile; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/TextToTalk/UI/ConfigurationWindow.cs b/src/TextToTalk/UI/ConfigurationWindow.cs index 64fcf5e..7c59aa8 100644 --- a/src/TextToTalk/UI/ConfigurationWindow.cs +++ b/src/TextToTalk/UI/ConfigurationWindow.cs @@ -1,19 +1,20 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Numerics; -using System.Text; +using Dalamud.Bindings.ImGui; using Dalamud.Game.Text; using Dalamud.Interface; using Dalamud.Interface.Windowing; using Dalamud.Plugin.Services; -using Dalamud.Bindings.ImGui; using Lumina.Excel.Sheets; using R3; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using System.Text; using TextToTalk.Backends; using TextToTalk.Data.Model; using TextToTalk.GameEnums; using TextToTalk.Services; +using TextToTalk.UI.Windows; namespace TextToTalk.UI { @@ -295,6 +296,12 @@ private void DrawSynthesizerSettings() // I'm sure there's a cleaner method to c this.config); Components.Tooltip( "Removes \"stuttering\" from NPC dialogue such as \"H-hello, nice to m-meet you...\""); + + + if (ImGui.Button($"Show Latency Data##{MemoizedId.Create()}")) + { + StatsWindow.Instance?.ToggleStats(); + } } } diff --git a/src/TextToTalk/UI/Windows/StatsWindow.cs b/src/TextToTalk/UI/Windows/StatsWindow.cs new file mode 100644 index 0000000..c729204 --- /dev/null +++ b/src/TextToTalk/UI/Windows/StatsWindow.cs @@ -0,0 +1,55 @@ +using Dalamud.Bindings.ImGui; +using Dalamud.Interface.Windowing; +using System; +using System.Linq; +using TextToTalk; +using TextToTalk.Backends; +using TextToTalk.UI.Windows; + +public class StatsWindow : Window +{ + private float[] dataArray = Array.Empty(); + private DateTime lastUpdateTime 
= DateTime.MinValue; + private readonly object updateLock = new(); + public static StatsWindow? Instance { get; private set; } + + private readonly LatencyTracker tracker; + public bool IsVisible = false; + + public StatsWindow(TextToTalk.LatencyTracker tracker) : base("TTS Statistics") + { + this.tracker = tracker; + Instance = this; + } + + public void ToggleStats() + { + this.IsOpen = !this.IsOpen; + } + + public override void Draw() + { + float[] fullDataArray = tracker.GetHistoryArray(); + + ImGui.Text($"Average Latency: {tracker.AverageLatency:F2} ms"); + ImGui.SameLine(); + if (ImGui.Button("Clear History")) + { + tracker.Clear(); + } + + if (ImGui.TreeNode("View Raw History")) + { + if (ImGui.BeginChild("RawDataList", new System.Numerics.Vector2(0, 150), true)) + { + for (int i = 0; i < fullDataArray.Length; i++) + { + ImGui.Text($"[{i:000}] {fullDataArray[i]:F2} ms"); + } + + ImGui.EndChild(); + } + ImGui.TreePop(); + } + } +} \ No newline at end of file diff --git a/src/TextToTalk/UI/Windows/StylesWindow.cs b/src/TextToTalk/UI/Windows/StylesWindow.cs index 2970ce8..d00d99c 100644 --- a/src/TextToTalk/UI/Windows/StylesWindow.cs +++ b/src/TextToTalk/UI/Windows/StylesWindow.cs @@ -29,11 +29,6 @@ public interface IVoiceStylesWindow void Draw(IConfigUIDelegates helpers); } - public interface IWindowController - { - void ToggleStyle(); - } - public class VoiceStyles : Window { private readonly VoiceBackendManager backendManager; diff --git a/src/TextToTalk/VoicePresetConfiguration.cs b/src/TextToTalk/VoicePresetConfiguration.cs index f116042..827ebbf 100644 --- a/src/TextToTalk/VoicePresetConfiguration.cs +++ b/src/TextToTalk/VoicePresetConfiguration.cs @@ -1,14 +1,15 @@ -using System; +using Newtonsoft.Json; +using System; using System.Collections.Generic; using System.IO; using System.Linq; -using Newtonsoft.Json; using TextToTalk.Backends; using TextToTalk.Backends.Azure; using TextToTalk.Backends.ElevenLabs; using TextToTalk.Backends.GoogleCloud; using TextToTalk.Backends.Kokoro; using TextToTalk.Backends.OpenAI; +using TextToTalk.Backends.Piper; using TextToTalk.Backends.Polly; using TextToTalk.Backends.System; using TextToTalk.Backends.Uberduck; @@ -254,8 +255,6 @@ private static VoicePreset RepairPreset(IDictionary corrupted) Id = Convert.ToInt32(GetNullableValue(corrupted, "Id")), Name = GetNullableValue(corrupted, "Name"), Gender = GetNullableValue(corrupted, "Gender"), - SampleRate = Convert.ToInt32(GetNullableValue(corrupted, "SampleRate")), - Pitch = Convert.ToSingle(GetNullableValue(corrupted, "Pitch")), PlaybackRate = Convert.ToSingle(GetNullableValue(corrupted, "PlaybackRate")), Volume = Convert.ToSingle(GetNullableValue(corrupted, "Volume")), Locale = GetNullableValue(corrupted, "Locale"), @@ -271,6 +270,17 @@ private static VoicePreset RepairPreset(IDictionary corrupted) InternalName = GetNullableValue(corrupted, "InternalName"), EnabledBackend = TTSBackend.Kokoro }, + TTSBackend.Piper => new PiperVoicePreset + { + Id = Convert.ToInt32(GetNullableValue(corrupted, "Id")), + Speed = Convert.ToSingle(GetNullableValue(corrupted, "Speed")), + Volume = Convert.ToSingle(GetNullableValue(corrupted, "Volume")), + Name = GetNullableValue(corrupted, "Name"), + InternalName = GetNullableValue(corrupted, "InternalName"), + ModelPath = GetNullableValue(corrupted, "ModelPath"), + + EnabledBackend = TTSBackend.Piper + }, _ => throw new ArgumentOutOfRangeException($"{backendCorrupt}"), }; } diff --git a/src/TextToTalk/packages.lock.json b/src/TextToTalk/packages.lock.json index 
0ca8f09..a839828 100644 --- a/src/TextToTalk/packages.lock.json +++ b/src/TextToTalk/packages.lock.json @@ -31,12 +31,12 @@ }, "Google.Cloud.TextToSpeech.V1": { "type": "Direct", - "requested": "[3.9.0, )", - "resolved": "3.9.0", - "contentHash": "JpejhPzzEQ6rdaf0nsjjJwj1CJb8Zs0x+TH27+A17KF2g0NqrgtAbpkUZTiGlQHhOzJSF1lB3amrQhbGjozJ3A==", + "requested": "[3.17.0, )", + "resolved": "3.17.0", + "contentHash": "27vM1NEBmCqAwqagwS0aEHfRBrFy7z6Ef+BblwKMaxtUUY0amdUdeXLY/PU8RSIHtJoan1K6ZKIS6YYqzgp77g==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.9.0, 5.0.0)", - "Google.LongRunning": "[3.3.0, 4.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)", + "Google.LongRunning": "[3.5.0, 4.0.0)" } }, "KokoroSharp.CPU": { @@ -77,6 +77,27 @@ "NAudio.WinMM": "2.2.1" } }, + "OpenAI": { + "type": "Direct", + "requested": "[2.8.0, )", + "resolved": "2.8.0", + "contentHash": "KcYpZ9IhuxFD2hGAJlL5vABtkr00CjeJU0SY8CjZQyzvzkzLop8jhdX3iDvteVJg6e3y4TEiY+Kti4gDJAagnA==", + "dependencies": { + "System.ClientModel": "1.8.1" + } + }, + "PiperSharp": { + "type": "Direct", + "requested": "[1.0.6, )", + "resolved": "1.0.6", + "contentHash": "g68TbampKc0ATx80nur6LHHrhIpXvmioIVuwAuWKcjTXTB2tf+Klk4JPwzWZRo+DRSR4kS370eh+davEQVR0cw==", + "dependencies": { + "NAudio": "2.2.1", + "NAudio.Core": "2.2.1", + "Newtonsoft.Json": "13.0.1", + "SharpCompress": "0.36.0" + } + }, "R3": { "type": "Direct", "requested": "[1.2.9, )", @@ -135,103 +156,103 @@ }, "Google.Api.CommonProtos": { "type": "Transitive", - "resolved": "2.16.0", - "contentHash": "37MuZrE9AAqHAdYgFLoTHydAiXDRriQZGVKEg6fr6ASnrY5GtauYXnQrGk5x2K3NmYzEXe+wkpaPVmxjb3NKjg==", + "resolved": "2.17.0", + "contentHash": "elfQPknFr495hm7vdy6ZlgyQh6yzZq9TU7sS35L/Fj/fqjM/mUGau9gVJLhvQEtUlPjtR80hpn/m9HvBMyCXIw==", "dependencies": { - "Google.Protobuf": "[3.28.2, 4.0.0)" + "Google.Protobuf": "[3.31.1, 4.0.0]" } }, "Google.Api.Gax": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "fjHHYcQ99u0ztqwT537rvVtJMdDy6G2VHBZ+F1cBjDGYNVZfrpk40DMQ/OpUGToT9ZGHVirhh3eJ73bw2ANVPQ==", + "resolved": "4.12.1", + "contentHash": "G62dRNOv5DolfRviT6CCrL2a5nZ/CWWdRzhADkGnpCkYSOc3QnH5xxRvZiOKuHU8weJ/pAqAqrj7+T9IWdlu2Q==", "dependencies": { "Microsoft.Bcl.AsyncInterfaces": "6.0.0", - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.Api.Gax.Grpc": { "type": "Transitive", - "resolved": "4.9.0", - "contentHash": "ToCx/0cs+wJ9j7vzKRcPAKneJVZrz/s9JhW9QsFx1dar9WzTxawQZ8xTjyieSy8tY0UiYCL1qYkn/iRrklYnSA==", + "resolved": "4.12.1", + "contentHash": "W3LjuitOWxWyvbwqeHvpgp0LdshEiTnw/pneDAfAhQ02VgU2gVEzSXfGNPsvL8hDPBXjngR/fWNme8Kungwwkw==", "dependencies": { - "Google.Api.CommonProtos": "2.16.0", - "Google.Api.Gax": "4.9.0", - "Google.Apis.Auth": "1.68.0", - "Grpc.Auth": "2.66.0", - "Grpc.Core.Api": "2.66.0", - "Grpc.Net.Client": "2.66.0", + "Google.Api.CommonProtos": "2.17.0", + "Google.Api.Gax": "4.12.1", + "Google.Apis.Auth": "1.72.0", + "Grpc.Auth": "[2.71.0, 3.0.0)", + "Grpc.Core.Api": "[2.71.0, 3.0.0)", + "Grpc.Net.Client": "[2.71.0, 3.0.0)", "Microsoft.Extensions.DependencyInjection.Abstractions": "6.0.0" } }, "Google.Apis": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "s2MymhdpH+ybZNBeZ2J5uFgFHApBp+QXf9FjZSdM1lk/vx5VqIknJwnaWiuAzXxPrLEkesX0Q+UsiWn39yZ9zw==", + "resolved": "1.72.0", + "contentHash": "QbSJ08W7QuqsfzDPOZDHl1aFzCYwMcfBoHqQRh7koglwDN5WacShCKYMpU/zR1Pf3h3sH6JTGEeM/txAxaJuEg==", "dependencies": { - "Google.Apis.Core": "1.68.0" + "Google.Apis.Core": "1.72.0" } }, "Google.Apis.Auth": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": 
"hFx8Qz5bZ4w0hpnn4tSmZaaFpjAMsgVElZ+ZgVLUZ2r9i+AKcoVgwiNfv1pruNS5cCvpXqhKECbruBCfRezPHA==", + "resolved": "1.72.0", + "contentHash": "RBoFwFKBHKUjuyJf2weEnqICQLaY0TdIrdFv2yC8bsiR2VFYxizOn3C/qN1FWCCb0Uh9GhW+zwAV1yUxPjiocw==", "dependencies": { - "Google.Apis": "1.68.0", - "Google.Apis.Core": "1.68.0", + "Google.Apis": "1.72.0", + "Google.Apis.Core": "1.72.0", "System.Management": "7.0.2" } }, "Google.Apis.Core": { "type": "Transitive", - "resolved": "1.68.0", - "contentHash": "pAqwa6pfu53UXCR2b7A/PAPXeuVg6L1OFw38WckN27NU2+mf+KTjoEg2YGv/f0UyKxzz7DxF1urOTKg/6dTP9g==", + "resolved": "1.72.0", + "contentHash": "ZmYX1PU0vTKFT42c7gp4zaYcb/0TFAXrt9qw8yEz0wjvaug85+/WddlPTfT525Qei8iIUsF6t4bHYrsb2O7crg==", "dependencies": { - "Newtonsoft.Json": "13.0.3" + "Newtonsoft.Json": "13.0.4" } }, "Google.LongRunning": { "type": "Transitive", - "resolved": "3.3.0", - "contentHash": "F2SZ83Jo466Wj/s1Z7QhIAmWBXxJZQyXZpcx0P8BR7d6s0FAj67vQjeUPESSJcvsy8AqYiYBhkUr2YpZhTQeHg==", + "resolved": "3.5.0", + "contentHash": "W8xO6FA+rG8WjKOsyIjTKjeKLcyCrjBBYeEdZ4QBkKQcxmRczbrfKhKQmdorb2V35CqXeeTbue5Na6Zkgyv8ow==", "dependencies": { - "Google.Api.Gax.Grpc": "[4.8.0, 5.0.0)" + "Google.Api.Gax.Grpc": "[4.12.1, 5.0.0)" } }, "Google.Protobuf": { "type": "Transitive", - "resolved": "3.28.2", - "contentHash": "Z86ZKAB+v1B/m0LTM+EVamvZlYw/g3VND3/Gs4M/+aDIxa2JE9YPKjDxTpf0gv2sh26hrve3eI03brxBmzn92g==" + "resolved": "3.31.1", + "contentHash": "gSnJbUmGiOTdWddPhqzrEscHq9Ls6sqRDPB9WptckyjTUyx70JOOAaDLkFff8gManZNN3hllQ4aQInnQyq/Z/A==" }, "Grpc.Auth": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "FRQlhMAcHf0GjAXIfhN6RydfZncLLXNNTOtpLL1bt57kp59vu40faW+dr6Vwl7ef/IUFfF38aiB5jvhAA/9Aow==", + "resolved": "2.71.0", + "contentHash": "t2aGh/pMgqmc3GimtYfC7VcgVY/VSbk6SLH+61wewsgK45tzxxD9nYYItT5bpLn7fbebirmHXfgJcVKIArd0cg==", "dependencies": { - "Google.Apis.Auth": "1.68.0", - "Grpc.Core.Api": "2.66.0" + "Google.Apis.Auth": "1.69.0", + "Grpc.Core.Api": "2.71.0" } }, "Grpc.Core.Api": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "HsjsQVAHe4hqP4t4rpUnmq+MZvPdyrlPsWF4T5fbMvyP3o/lMV+KVJfDlaNH8+v0aGQTVT3EsDFufbhaWb52cw==" + "resolved": "2.71.0", + "contentHash": "QquqUC37yxsDzd1QaDRsH2+uuznWPTS8CVE2Yzwl3CvU4geTNkolQXoVN812M2IwT6zpv3jsZRc9ExJFNFslTg==" }, "Grpc.Net.Client": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "GwkSsssXFgN9+M2U+UQWdErf61sn1iqgP+2NRBlDXATcP9vlxda0wySxd/eIL8U522+SnyFNUXlvQ5tAzGk9cA==", + "resolved": "2.71.0", + "contentHash": "U1vr20r5ngoT9nlb7wejF28EKN+taMhJsV9XtK9MkiepTZwnKxxiarriiMfCHuDAfPUm9XUjFMn/RIuJ4YY61w==", "dependencies": { - "Grpc.Net.Common": "2.66.0", + "Grpc.Net.Common": "2.71.0", "Microsoft.Extensions.Logging.Abstractions": "6.0.0" } }, "Grpc.Net.Common": { "type": "Transitive", - "resolved": "2.66.0", - "contentHash": "YJpQpIvpo0HKlsG6SHwaieyji08qfv0DdEDIewCAA0egQY08637sHOj1netLGUhzBEsCqlGC3e92TZ2uqhxnvw==", + "resolved": "2.71.0", + "contentHash": "v0c8R97TwRYwNXlC8GyRXwYTCNufpDfUtj9la+wUrZFzVWkFJuNAltU+c0yI3zu0jl54k7en6u2WKgZgd57r2Q==", "dependencies": { - "Grpc.Core.Api": "2.66.0" + "Grpc.Core.Api": "2.71.0" } }, "KokoroSharp": { @@ -258,13 +279,16 @@ }, "Microsoft.Extensions.DependencyInjection.Abstractions": { "type": "Transitive", - "resolved": "6.0.0", - "contentHash": "xlzi2IYREJH3/m6+lUrQlujzX8wDitm4QGnUu6kUXTQAWPuZY8i+ticFJbzfqaetLA6KR/rO6Ew/HuYD+bxifg==" + "resolved": "8.0.2", + "contentHash": "3iE7UF7MQkCv1cxzCahz+Y/guQbTqieyxyaWKhrRO91itI9cOKO76OHeQDahqG4MmW5umr3CcCvGmK92lWNlbg==" }, "Microsoft.Extensions.Logging.Abstractions": { "type": 
"Transitive", - "resolved": "6.0.0", - "contentHash": "/HggWBbTwy8TgebGSX5DBZ24ndhzi93sHUBDvP1IxbZD7FDokYzdAr6+vbWGjw2XAfR2EJ1sfKUotpjHnFWPxA==" + "resolved": "8.0.3", + "contentHash": "dL0QGToTxggRLMYY4ZYX5AMwBb+byQBd/5dMiZE07Nv73o6I5Are3C7eQTh7K2+A4ct0PVISSr7TZANbiNb2yQ==", + "dependencies": { + "Microsoft.Extensions.DependencyInjection.Abstractions": "8.0.2" + } }, "Microsoft.ML.OnnxRuntime": { "type": "Transitive", @@ -331,8 +355,8 @@ }, "Newtonsoft.Json": { "type": "Transitive", - "resolved": "13.0.3", - "contentHash": "HrC5BXdl00IP9zeV+0Z848QWPAoCr9P3bDEZguI+gkLcBKAOxix/tLEAAHC+UvDNPv4a2d18lOReHMOagPa+zQ==" + "resolved": "13.0.4", + "contentHash": "pdgNNMai3zv51W5aq268sujXUyx7SNdE2bj1wZcWjAQrKMFZV260lbqYop1d2GM67JI1huLRwxo9ZqnfF/lC6A==" }, "NumSharp": { "type": "Transitive", @@ -358,6 +382,23 @@ "resolved": "5.0.0-pre.13", "contentHash": "65qbZS49AfrTM6jtZ2RDTWAzLe13ywCXIiSP5QrAJLmZT6sQqHGd1LfFXLhx8Ccp77qy7qh/LHsxpUOlkgZTCg==" }, + "SharpCompress": { + "type": "Transitive", + "resolved": "0.36.0", + "contentHash": "48am//T6Ou+GmyPmBaxaFN1ym0VNidRcBeANr9+OYTzpKRz8QMGzAkHVkCV30lFQ/gnWqGr50AuebahpG1C6xA==", + "dependencies": { + "ZstdSharp.Port": "0.7.4" + } + }, + "System.ClientModel": { + "type": "Transitive", + "resolved": "1.8.1", + "contentHash": "4oUQgw/vaO4FBOk3YsH40hbrjxRED1l95rRLvTMtHXfQxapXya9IfPpm/KgwValFFtYTfYGFOs/qzGmGyexicQ==", + "dependencies": { + "Microsoft.Extensions.Logging.Abstractions": "8.0.3", + "System.Memory.Data": "8.0.1" + } + }, "System.CodeDom": { "type": "Transitive", "resolved": "7.0.0", @@ -371,6 +412,11 @@ "System.CodeDom": "7.0.0" } }, + "System.Memory.Data": { + "type": "Transitive", + "resolved": "8.0.1", + "contentHash": "BVYuec3jV23EMRDeR7Dr1/qhx7369dZzJ9IWy2xylvb4YfXsrUxspWc4UWYid/tj4zZK58uGZqn2WQiaDMhmAg==" + }, "System.Numerics.Tensors": { "type": "Transitive", "resolved": "9.0.5", @@ -381,6 +427,11 @@ "resolved": "15.3.0", "contentHash": "F93japYa9YrJ59AZGhgdaUGHN7ITJ55FBBg/D/8C0BDgahv/rQD6MOSwHxOJJpon1kYyslVbeBrQ2wcJhox01w==" }, + "ZstdSharp.Port": { + "type": "Transitive", + "resolved": "0.7.4", + "contentHash": "ziptnotpUJr51afwXJQ5Wc03dvDiZAdmxS08s1g7SHn/VzbyZUXdH6yORk/zaNjzUOEE6pVZ0Nqztab0rYROgQ==" + }, "texttotalk.data": { "type": "Project", "dependencies": {