From d690c11fb15efe636e9b873d0eeb7c70d2b2618d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= Date: Mon, 9 Feb 2026 11:57:59 +0800 Subject: [PATCH] feat(model/cosyvoice): add hot_fix and max_prompt_audio_length param in enrollment --- dashscope/audio/tts_v2/enrollment.py | 4 ++ dashscope/audio/tts_v2/speech_synthesizer.py | 53 ++++++++++++++++++++ dashscope/multimodal/multimodal_dialog.py | 3 +- 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/dashscope/audio/tts_v2/enrollment.py b/dashscope/audio/tts_v2/enrollment.py index 35f2c54..e57e869 100644 --- a/dashscope/audio/tts_v2/enrollment.py +++ b/dashscope/audio/tts_v2/enrollment.py @@ -88,6 +88,7 @@ def create_voice( prefix: str, url: str, language_hints: List[str] = None, + max_prompt_audio_length: float = None, ) -> str: """ 创建新克隆音色 @@ -95,6 +96,7 @@ def create_voice( param: prefix 音色自定义前缀,仅允许数字和小写字母,小于十个字符。 param: url 用于克隆的音频文件url param: language_hints 克隆音色目标语言 + param: max_prompt_audio_length 音频预处理输出的prompt audio最长长度。单位为秒。默认为10s。 return: voice_id """ @@ -106,6 +108,8 @@ def create_voice( } if language_hints is not None: input_params["language_hints"] = language_hints + if max_prompt_audio_length is not None: + input_params["max_prompt_audio_length"] = max_prompt_audio_length response = self.__call_with_input(input_params) self._last_request_id = response.request_id if response.status_code == 200: diff --git a/dashscope/audio/tts_v2/speech_synthesizer.py b/dashscope/audio/tts_v2/speech_synthesizer.py index 66a2013..8ef7420 100644 --- a/dashscope/audio/tts_v2/speech_synthesizer.py +++ b/dashscope/audio/tts_v2/speech_synthesizer.py @@ -8,7 +8,9 @@ import threading import time import uuid +from dataclasses import dataclass from enum import Enum, unique +from typing import Dict, List, Optional import websocket @@ -26,6 +28,35 @@ ) +@dataclass +class HotFix: + """ + Hot fix parameters for pronunciation and text replacement. 
+ + Attributes: + pronunciation: List of pronunciation mappings, e.g., [{"草地": "cao3 di4"}] + replace: List of text replacement mappings, e.g., [{"草地": "草弟"}] + + Example: + hot_fix = HotFix( + pronunciation=[{"草地": "cao3 di4"}], + replace=[{"草地": "草弟"}] + ) + hot_fix_dict = hot_fix.to_dict() + """ + + pronunciation: Optional[List[Dict[str, str]]] = None + replace: Optional[List[Dict[str, str]]] = None + + def to_dict(self) -> Dict[str, List[Dict[str, str]]]: + result = {} + if self.pronunciation is not None: + result["pronunciation"] = self.pronunciation + if self.replace is not None: + result["replace"] = self.replace + return result + + class ResultCallback: """ An interface that defines callback methods for getting speech synthesis results. # noqa E501 @@ -246,6 +277,7 @@ def __init__( # pylint: disable=redefined-builtin callback: ResultCallback = None, workspace=None, url=None, + hot_fix=None, additional_params=None, ): """ @@ -282,6 +314,14 @@ def __init__( # pylint: disable=redefined-builtin The language hints of the synthesizer. supported language: zh, en. additional_params: Dict Additional parameters for the Dashscope API. + hot_fix: Dict or HotFix + Hot fix parameters for pronunciation and text replacement. + Example: { + "pronunciation": [{"草地": "cao3 di4"}], + "replace": [{"草地": "草弟"}] + } + enable_markdown_filter: bool + Whether to enable the markdown filter. Should be set in additional_params. 
""" self.ws = None self.start_event = threading.Event() @@ -316,6 +356,7 @@ def __init__( # pylint: disable=redefined-builtin workspace, url, additional_params, + hot_fix, ) def __send_str(self, data: str): @@ -404,6 +445,7 @@ def __update_params( # pylint: disable=redefined-builtin url=None, additional_params=None, close_ws_after_use=True, + hot_fix=None, ): if model is None: raise ModelRequired("Model is required!") @@ -417,6 +459,17 @@ def __update_params( # pylint: disable=redefined-builtin raise InputRequired("apikey is required!") self.headers = headers self.workspace = workspace + + # Merge hot_fix into additional_params + if hot_fix is not None: + if additional_params is None: + additional_params = {} + # Support both HotFix instance and dict + if isinstance(hot_fix, HotFix): + additional_params["hot_fix"] = hot_fix.to_dict() + else: + additional_params["hot_fix"] = hot_fix + self.additional_params = additional_params self.model = model self.voice = voice diff --git a/dashscope/multimodal/multimodal_dialog.py b/dashscope/multimodal/multimodal_dialog.py index f7ed847..a8920ee 100644 --- a/dashscope/multimodal/multimodal_dialog.py +++ b/dashscope/multimodal/multimodal_dialog.py @@ -363,7 +363,7 @@ def _send_start_request( self._send_text_frame(_start_json) def _run_forever(self): - self.ws.run_forever(ping_interval=20, ping_timeout=10) + self.ws.run_forever(ping_interval=None, ping_timeout=None) def _connect(self, api_key: str): """初始化WebSocket连接并发送启动请求。""" @@ -376,7 +376,6 @@ def _connect(self, api_key: str): on_close=self._on_close, ) self.thread = threading.Thread(target=self._run_forever) - self.ws.ping_interval = 3 self.thread.daemon = True self.thread.start()