From d690c11fb15efe636e9b873d0eeb7c70d2b2618d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=A5=E4=BB=99?= <songsong.sss@alibaba-inc.com>
Date: Mon, 9 Feb 2026 11:57:59 +0800
Subject: [PATCH] feat(model/cosyvoice): add hot_fix and
 max_prompt_audio_length param in enrollment

---
 dashscope/audio/tts_v2/enrollment.py         |  4 ++
 dashscope/audio/tts_v2/speech_synthesizer.py | 53 ++++++++++++++++++++
 dashscope/multimodal/multimodal_dialog.py    |  3 +-
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/dashscope/audio/tts_v2/enrollment.py b/dashscope/audio/tts_v2/enrollment.py
index 35f2c54..e57e869 100644
--- a/dashscope/audio/tts_v2/enrollment.py
+++ b/dashscope/audio/tts_v2/enrollment.py
@@ -88,6 +88,7 @@ def create_voice(
         prefix: str,
         url: str,
         language_hints: List[str] = None,
+        max_prompt_audio_length: float = None,
     ) -> str:
         """
         创建新克隆音色
@@ -95,6 +96,7 @@ def create_voice(
         param: prefix 音色自定义前缀，仅允许数字和小写字母，小于十个字符。
         param: url 用于克隆的音频文件url
         param: language_hints 克隆音色目标语言
+        param: max_prompt_audio_length 音频预处理输出的prompt audio最长长度。单位为秒。默认为10s。
         return: voice_id
         """
 
@@ -106,6 +108,8 @@ def create_voice(
         }
         if language_hints is not None:
             input_params["language_hints"] = language_hints
+        if max_prompt_audio_length is not None:
+            input_params["max_prompt_audio_length"] = max_prompt_audio_length
         response = self.__call_with_input(input_params)
         self._last_request_id = response.request_id
         if response.status_code == 200:
diff --git a/dashscope/audio/tts_v2/speech_synthesizer.py b/dashscope/audio/tts_v2/speech_synthesizer.py
index 66a2013..8ef7420 100644
--- a/dashscope/audio/tts_v2/speech_synthesizer.py
+++ b/dashscope/audio/tts_v2/speech_synthesizer.py
@@ -8,7 +8,9 @@
 import threading
 import time
 import uuid
+from dataclasses import dataclass
 from enum import Enum, unique
+from typing import Dict, List, Optional
 
 import websocket
 
@@ -26,6 +28,35 @@
 )
 
 
+@dataclass
+class HotFix:
+    """
+    Hot fix parameters for pronunciation and text replacement.
+
+    Attributes:
+        pronunciation: List of pronunciation fixes, e.g., [{"草地": "cao3 di4"}]
+        replace: List of text replacements, e.g., [{"草地": "草弟"}]
+
+    Example:
+        hot_fix = HotFix(
+            pronunciation=[{"草地": "cao3 di4"}],
+            replace=[{"草地": "草弟"}],
+        )
+        hot_fix_dict = hot_fix.to_dict()
+    """
+
+    pronunciation: Optional[List[Dict[str, str]]] = None
+    replace: Optional[List[Dict[str, str]]] = None
+
+    def to_dict(self) -> Dict[str, List[Dict[str, str]]]:
+        result = {}
+        if self.pronunciation is not None:
+            result["pronunciation"] = self.pronunciation
+        if self.replace is not None:
+            result["replace"] = self.replace
+        return result
+
+
 class ResultCallback:
     """
     An interface that defines callback methods for getting speech synthesis results. # noqa E501
@@ -246,6 +277,7 @@ def __init__(  # pylint: disable=redefined-builtin
         callback: ResultCallback = None,
         workspace=None,
         url=None,
+        hot_fix=None,
         additional_params=None,
     ):
         """
@@ -282,6 +314,14 @@ def __init__(  # pylint: disable=redefined-builtin
             The language hints of the synthesizer. supported language: zh, en.
         additional_params: Dict
             Additional parameters for the Dashscope API.
+        hot_fix: Dict or HotFix
+            Hot fix parameters for pronunciation and text replacement.
+            Example: {
+                "pronunciation": [{"草地": "cao3 di4"}],
+                "replace": [{"草地": "草弟"}]
+            }
+        enable_markdown_filter: bool
+            Whether to enable the markdown filter. Set it in additional_params.
         """
         self.ws = None
         self.start_event = threading.Event()
@@ -316,6 +356,7 @@ def __init__(  # pylint: disable=redefined-builtin
             workspace,
             url,
             additional_params,
+            hot_fix,
         )
 
     def __send_str(self, data: str):
@@ -404,6 +445,7 @@ def __update_params(  # pylint: disable=redefined-builtin
         url=None,
         additional_params=None,
         close_ws_after_use=True,
+        hot_fix=None,
     ):
         if model is None:
             raise ModelRequired("Model is required!")
@@ -417,6 +459,17 @@ def __update_params(  # pylint: disable=redefined-builtin
             raise InputRequired("apikey is required!")
         self.headers = headers
         self.workspace = workspace
+
+        # Merge hot_fix into additional_params
+        if hot_fix is not None:
+            if additional_params is None:
+                additional_params = {}
+            # Support both HotFix instance and dict
+            if isinstance(hot_fix, HotFix):
+                additional_params["hot_fix"] = hot_fix.to_dict()
+            else:
+                additional_params["hot_fix"] = hot_fix
+
         self.additional_params = additional_params
         self.model = model
         self.voice = voice
diff --git a/dashscope/multimodal/multimodal_dialog.py b/dashscope/multimodal/multimodal_dialog.py
index f7ed847..a8920ee 100644
--- a/dashscope/multimodal/multimodal_dialog.py
+++ b/dashscope/multimodal/multimodal_dialog.py
@@ -363,7 +363,7 @@ def _send_start_request(
         self._send_text_frame(_start_json)
 
     def _run_forever(self):
-        self.ws.run_forever(ping_interval=20, ping_timeout=10)
+        self.ws.run_forever(ping_interval=None, ping_timeout=None)
 
     def _connect(self, api_key: str):
         """初始化WebSocket连接并发送启动请求。"""
@@ -376,7 +376,6 @@ def _connect(self, api_key: str):
             on_close=self._on_close,
         )
         self.thread = threading.Thread(target=self._run_forever)
-        self.ws.ping_interval = 3
         self.thread.daemon = True
         self.thread.start()