From 14f2f66831d490ddc821e1624bee17d344be4f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Wed, 28 Jan 2026 14:45:27 +0800 Subject: [PATCH] feat(model/cosyvoice): support hot_fix params and max_prompt_audio_length --- .../MultiModalConversationParam.java | 4 ++ .../audio/tts/SpeechSynthesisApiKeywords.java | 8 +++ .../dashscope/audio/ttsv2/ParamHotFix.java | 62 +++++++++++++++++++ .../audio/ttsv2/SpeechSynthesisParam.java | 17 +++++ .../enrollment/VoiceEnrollmentParam.java | 6 ++ .../enrollment/VoiceEnrollmentService.java | 1 + .../dashscope/TestTtsV2SpeechSynthesizer.java | 12 ++++ 7 files changed, 110 insertions(+) create mode 100644 src/main/java/com/alibaba/dashscope/audio/ttsv2/ParamHotFix.java diff --git a/src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/MultiModalConversationParam.java b/src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/MultiModalConversationParam.java index 2e69b77..1e9cdfb 100644 --- a/src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/MultiModalConversationParam.java +++ b/src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/MultiModalConversationParam.java @@ -182,6 +182,10 @@ public JsonObject getInput() { jsonObject.addProperty(ApiKeywords.VOICE, voice.getValue()); } + if (parameters != null && !parameters.isEmpty() && parameters.containsKey(ApiKeywords.VOICE)) { + jsonObject.addProperty(ApiKeywords.VOICE, (String) parameters.get(ApiKeywords.VOICE)); + } + if (languageType != null) { jsonObject.addProperty(ApiKeywords.LANGUAGE_TYPE, languageType); } diff --git a/src/main/java/com/alibaba/dashscope/audio/tts/SpeechSynthesisApiKeywords.java b/src/main/java/com/alibaba/dashscope/audio/tts/SpeechSynthesisApiKeywords.java index d943bbe..6d792fc 100644 --- a/src/main/java/com/alibaba/dashscope/audio/tts/SpeechSynthesisApiKeywords.java +++ b/src/main/java/com/alibaba/dashscope/audio/tts/SpeechSynthesisApiKeywords.java @@ -6,7 +6,9 @@ public class SpeechSynthesisApiKeywords { public static final String TEXT_TYPE = "text_type"; public static final String FORMAT = "format"; + public static final String BIT_RATE = "bit_rate"; + public static final String VOICE = "voice"; public static final String SAMPLE_RATE = "sample_rate"; @@ -21,6 +23,12 @@ public class SpeechSynthesisApiKeywords { public static final String PHONEME_TIMESTAMP = "phoneme_timestamp_enabled"; + public static final String HOT_FIX = "hot_fix"; + + public static final String PRONUNCIATION = "pronunciation"; + + public static final String REPLACE = "replace"; + public static final String SENTENCE = "sentence"; public static final String WORDS = "words"; diff --git a/src/main/java/com/alibaba/dashscope/audio/ttsv2/ParamHotFix.java b/src/main/java/com/alibaba/dashscope/audio/ttsv2/ParamHotFix.java new file mode 100644 index 0000000..6a34730 --- /dev/null +++ b/src/main/java/com/alibaba/dashscope/audio/ttsv2/ParamHotFix.java @@ -0,0 +1,62 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +package com.alibaba.dashscope.audio.ttsv2; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import lombok.AllArgsConstructor; +import lombok.Data; + +/** Hot fix configuration for speech synthesis, including pronunciation and replace rules. */ +@Data +public class ParamHotFix { + + /** Pronunciation rules to customize specific words. */ + private List pronunciation; + + /** Replace rules to replace specific words with others. */ + private List replace; + + public ArrayList getPronunciation() { + if (pronunciation == null || pronunciation.isEmpty()) { + return null; + } + ArrayList pronunciationList = new ArrayList<>(); + for (PronunciationItem item : pronunciation) { + HashMap pronunciationItem = new HashMap<>(); + pronunciationItem.put(item.getText(), item.getPinyin()); + pronunciationList.add(pronunciationItem); + } + + return pronunciationList; + } + + public ArrayList getReplace() { + if (replace == null || replace.isEmpty()) { + return null; + } + ArrayList replaceList = new ArrayList<>(); + for (ReplaceItem item : replace) { + HashMap replaceItem = new HashMap<>(); + replaceItem.put(item.getText(), item.getReplacement()); + replaceList.add(replaceItem); + } + + return replaceList; + } + + @Data + @AllArgsConstructor + public static class PronunciationItem { + private String text; + private String pinyin; + } + + @Data + @AllArgsConstructor + public static class ReplaceItem { + private String text; + private String replacement; + } +} diff --git a/src/main/java/com/alibaba/dashscope/audio/ttsv2/SpeechSynthesisParam.java b/src/main/java/com/alibaba/dashscope/audio/ttsv2/SpeechSynthesisParam.java index f1c535d..68c7a6f 100644 --- a/src/main/java/com/alibaba/dashscope/audio/ttsv2/SpeechSynthesisParam.java +++ b/src/main/java/com/alibaba/dashscope/audio/ttsv2/SpeechSynthesisParam.java @@ -55,6 +55,9 @@ public class SpeechSynthesisParam extends FullDuplexServiceParam { @Builder.Default private List languageHints = null; /** synthesis style */ @Builder.Default private int style = 0; + /** Hot fix configuration for pronunciation and replace rules. */ + @Builder.Default private ParamHotFix hotFix = null; + @Override public Map getParameters() { @@ -83,6 +86,20 @@ public Map getParameters() { if (getStyle() != 0) { params.put(SpeechSynthesisApiKeywords.STYLE, getStyle()); } + // Add hot fix parameters if present + if (getHotFix() != null) { + Map hotFixParams = new HashMap<>(); + if (getHotFix().getPronunciation() != null && !getHotFix().getPronunciation().isEmpty()) { + hotFixParams.put(SpeechSynthesisApiKeywords.PRONUNCIATION, getHotFix().getPronunciation()); + } + if (getHotFix().getReplace() != null && !getHotFix().getReplace().isEmpty()) { + hotFixParams.put(SpeechSynthesisApiKeywords.REPLACE, getHotFix().getReplace()); + } + if (!hotFixParams.isEmpty()) { + params.put(SpeechSynthesisApiKeywords.HOT_FIX, hotFixParams); + } + } + params.putAll(parameters); return params; } diff --git a/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentParam.java b/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentParam.java index a69ab70..e741c0e 100644 --- a/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentParam.java +++ b/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentParam.java @@ -8,6 +8,7 @@ import java.nio.ByteBuffer; import java.security.InvalidParameterException; import java.util.List; + import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.SuperBuilder; @@ -25,6 +26,8 @@ public class VoiceEnrollmentParam extends HalfDuplexServiceParam { private int pageIndex; private int pageSize; + /** Maximum length of prompt audio in seconds. */ + private float maxPromptAudioLength = 10.0f; protected VoiceEnrollmentParam(HalfDuplexServiceParamBuilder b) { super(b); @@ -50,6 +53,9 @@ public JsonObject getInput() { if (languageHints != null) { input.add("language_hints", JsonUtils.toJsonArray(languageHints)); } + if (maxPromptAudioLength > 0) { + input.addProperty("max_prompt_audio_length", maxPromptAudioLength); + } break; case LIST: input.addProperty(ApiKeywords.ACTION, operationType.getValue()); diff --git a/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentService.java b/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentService.java index df3246c..23efe5d 100644 --- a/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentService.java +++ b/src/main/java/com/alibaba/dashscope/audio/ttsv2/enrollment/VoiceEnrollmentService.java @@ -144,6 +144,7 @@ public Voice createVoice( .languageHints(customParam.getLanguageHints()) .headers(customParam.getHeaders()) .resources(customParam.getResources()) + .maxPromptAudioLength(customParam.getMaxPromptAudioLength()) .parameters(customParam.getParameters()) .workspace(customParam.getWorkspace()) .build(); diff --git a/src/test/java/com/alibaba/dashscope/TestTtsV2SpeechSynthesizer.java b/src/test/java/com/alibaba/dashscope/TestTtsV2SpeechSynthesizer.java index 0320038..c6858cf 100644 --- a/src/test/java/com/alibaba/dashscope/TestTtsV2SpeechSynthesizer.java +++ b/src/test/java/com/alibaba/dashscope/TestTtsV2SpeechSynthesizer.java @@ -5,6 +5,7 @@ import static org.junit.Assert.assertEquals; import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult; +import com.alibaba.dashscope.audio.ttsv2.ParamHotFix; import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat; import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam; import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer; @@ -129,6 +130,16 @@ public void testStreamingCall() { // 获取 URL String url = mockServer.url("/binary").toString(); + ParamHotFix hotFix = new ParamHotFix(); + ArrayList pronunciations = new ArrayList<>(); + pronunciations.add(new ParamHotFix.PronunciationItem("今天", "jin1 tian1")); + pronunciations.add(new ParamHotFix.PronunciationItem("草地", "cao3 di4")); + hotFix.setPronunciation(pronunciations); + + ArrayList replaces = new ArrayList<>(); + replaces.add(new ParamHotFix.ReplaceItem("草地", "草弟")); + replaces.add(new ParamHotFix.ReplaceItem("惠州", "汇州")); + hotFix.setReplace(replaces); // 在真实世界中,你会在这里做 HTTP 请求,并得到响应 System.out.println("Mock Server is running at: " + url); @@ -138,6 +149,7 @@ public void testStreamingCall() { .model("cosyvoice-v1") .voice("longxiaochun") .format(SpeechSynthesisAudioFormat.MP3_16000HZ_MONO_128KBPS) + .hotFix(hotFix) .build(); SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback); synthesizer.setStartedTimeout(1000);